diff --git a/.gitignore b/.gitignore
index 5764bfe22c6..13d8aefe39d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -73,7 +73,8 @@ GSYMS
 /src/kaldi.mk.bak
 
 # /egs/
-/egs/*/*/mfcc
+/egs/*/*/mfcc*
+/egs/*/*/fbank*
 /egs/*/*/plp
 /egs/*/*/exp
 /egs/*/*/data
diff --git a/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh
index 0be0e2c79c6..2aeb836083c 100644
--- a/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -177,7 +177,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh
index 78dd4000e58..e3e97e9ae2a 100644
--- a/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh
+++ b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh
@@ -197,7 +197,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh
index b38fa4d9c7a..e0acea5f168 100755
--- a/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -168,7 +168,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh
index 6b7223785d9..965932316b8 100755
--- a/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh
+++ b/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh
@@ -170,7 +170,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh
index 86c9becac5b..9148f54d29b 100755
--- a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -188,7 +188,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh
index d8560e63909..f0d87890c00 100755
--- a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh
@@ -238,7 +238,7 @@ if [ $stage -le 12 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+  utils/mkgraph.sh data/lang_test $dir $dir/graph
 fi
 
 graph_dir=$dir/graph
diff --git a/egs/ami/s5/local/chain/run_blstm_ami_5.sh b/egs/ami/s5/local/chain/run_blstm_ami_5.sh
index 53221a2bd53..90e096a9264 100755
--- a/egs/ami/s5/local/chain/run_blstm_ami_5.sh
+++ b/egs/ami/s5/local/chain/run_blstm_ami_5.sh
@@ -149,7 +149,7 @@ if [ $stage -le 18 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 19 ]; then
diff --git a/egs/ami/s5/local/chain/run_tdnn_ami_5.sh b/egs/ami/s5/local/chain/run_tdnn_ami_5.sh
index df635316127..5b9ab9de043 100755
--- a/egs/ami/s5/local/chain/run_tdnn_ami_5.sh
+++ b/egs/ami/s5/local/chain/run_tdnn_ami_5.sh
@@ -175,7 +175,7 @@ if [ $stage -le 18 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 19 ]; then
diff --git a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh
index 4d260e3c517..57628d86798 100755
--- a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh
+++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh
@@ -307,7 +307,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh
index 3546b6a7ced..8aae7760a71 100755
--- a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh
@@ -301,7 +301,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh
index 1a839b045bd..64d8e1822ca 100755
--- a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh
+++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh
@@ -330,7 +330,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh
index d926c1dc6d7..23e5bda2038 100644
--- a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh
@@ -284,7 +284,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh
index d9cd1c356e8..d9dd08166c2 100644
--- a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh
@@ -278,7 +278,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh
index a0805b4f9f1..ac5c403c4bd 100755
--- a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh
@@ -287,7 +287,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh
index 03ebc5845e4..3b107519114 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh
@@ -217,7 +217,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh
index 997357b80a9..2ea2266b1b5 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh
@@ -245,7 +245,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh
index 4d062e65429..de2030c71cc 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh
@@ -232,7 +232,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh
index 387570388d0..4375253d3a2 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh
@@ -244,7 +244,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh
index 0436b08cdc0..b372db56e32 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh
@@ -242,7 +242,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh
index 4ca526d63b8..ee887fd91c2 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh
@@ -247,7 +247,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh
index baed760bb68..8c421c58351 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh
@@ -248,7 +248,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh
index e721a858c0a..2c226c01105 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh
@@ -251,7 +251,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh
index de40cb2d1a4..7486b3b6d6e 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh
@@ -253,7 +253,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
index 4f580b88f6b..84470f6530b 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -259,7 +259,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh
index 904a079d7de..93ef04d79f5 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh
@@ -263,7 +263,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh
index 511e520465a..60a6356077e 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh
@@ -262,7 +262,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh
index bd81b7df4eb..a3ee0bcb631 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh
@@ -264,7 +264,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh
index 50903e78b6d..aff42a3647f 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh
@@ -264,7 +264,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh
index f6c53001498..a748e034cf8 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh
@@ -263,7 +263,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh
index 79fd9ef3fb5..0cdf44279f2 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh
@@ -264,7 +264,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh
index e58a7f89e03..428e4926693 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh
@@ -265,7 +265,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh
index 13f894f5a48..3bd87ca26f0 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh
@@ -264,7 +264,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh
index 48b31832e8c..b835da9cf38 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh
@@ -275,7 +275,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh
index e675bc494bb..0caf4494b79 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh
@@ -271,7 +271,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh
index 2d019398274..f8a6a0f1aa7 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh
@@ -315,7 +315,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh
index 9e5b971bbe2..cb49eb94888 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh
@@ -321,7 +321,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh
index 9575c3cf686..0df4d741fe4 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh
@@ -269,7 +269,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh
index a7f2625c181..6bbc6fd52ad 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh
@@ -276,7 +276,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh
index ca920869b30..dacf4639a1f 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh
@@ -278,7 +278,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh
index 53dbd5238db..1fd80acab90 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh
@@ -272,7 +272,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh
index dafef668e60..d39a7cf6c9f 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh
@@ -273,7 +273,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh
index 677946d0b9a..d0b3f4181bc 100644
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh
@@ -272,7 +272,7 @@ if [ $stage -le 17 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+  utils/mkgraph.sh data/lang_${LM} $dir $graph_dir
 fi
 
 if [ $stage -le 18 ]; then
diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh
index bd13010c791..2928bde6ab4 100755
--- a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh
+++ b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh
@@ -230,7 +230,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $dir/graph_pp
+  utils/mkgraph.sh data/lang_pp_test $dir $dir/graph_pp
 fi
 
 if [ $stage -le 14 ]; then
diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh
index b5979a3ce6b..fed98e57b99 100755
--- a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh
+++ b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh
@@ -199,7 +199,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $dir/graph_pp
+  utils/mkgraph.sh data/lang_pp_test $dir $dir/graph_pp
 fi
 
 
diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh
index cd548142598..ad85e63a975 100755
--- a/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh
+++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh
@@ -217,7 +217,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $dir/graph_pp
+  utils/mkgraph.sh data/lang_pp_test $dir $dir/graph_pp
 fi
 
 if [ $stage -le 14 ]; then
diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_asp_1.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_asp_1.sh
index 5b35c902354..8eeb2fef21a 100755
--- a/egs/aspire/s5/local/chain/tuning/run_tdnn_asp_1.sh
+++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_asp_1.sh
@@ -190,7 +190,7 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $dir/graph_pp
+  utils/mkgraph.sh data/lang_pp_test $dir $dir/graph_pp
 fi
 
 if [ $stage -le 14 ]; then
diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
index f98dff5e6fa..ae298c3148e 100755
--- a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
+++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh
@@ -248,7 +248,7 @@ if [ $stage -le 14 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_pp_test $dir $graph_dir
+  utils/mkgraph.sh data/lang_pp_test $dir $graph_dir
 fi
 
 if [ $stage -le 15 ]; then
diff --git a/egs/babel/s5c/local/ali_to_rttm.sh b/egs/babel/s5c/local/ali_to_rttm.sh
index ef11f516ea3..4b1ef5948cd 100755
--- a/egs/babel/s5c/local/ali_to_rttm.sh
+++ b/egs/babel/s5c/local/ali_to_rttm.sh
@@ -23,7 +23,7 @@
 #local/ali_to_rttm.sh data/dev2h data/lang exp/sgmm5/align_dev2h/
 
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 boost_silence=1.0
diff --git a/egs/babel/s5d/local/ali_to_rttm.sh b/egs/babel/s5d/local/ali_to_rttm.sh
index cb4f0740130..6a720c91287 100755
--- a/egs/babel/s5d/local/ali_to_rttm.sh
+++ b/egs/babel/s5d/local/ali_to_rttm.sh
@@ -23,7 +23,7 @@
 #local/ali_to_rttm.sh data/dev2h data/lang exp/sgmm5/align_dev2h/
 
 cmd=run.pl
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+scale_opts="--acoustic-scale=0.1"
 beam=10
 retry_beam=40
 boost_silence=1.0
diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn.sh
index 7b4535f8c5e..102225f9bc4 100755
--- a/egs/babel/s5d/local/chain/tuning/run_tdnn.sh
+++ b/egs/babel/s5d/local/chain/tuning/run_tdnn.sh
@@ -210,7 +210,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph
+  utils/mkgraph.sh data/langp_test $dir $dir/graph
 fi
 
 exit 0
diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh
index 5fc14dda826..93958c93717 100755
--- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh
+++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh
@@ -217,7 +217,7 @@ if [ $stage -le 19 ]; then
   # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
- utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph + utils/mkgraph.sh data/langp_test $dir $dir/graph fi exit 0 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh index 8c7de5d18d4..d4f2ed70cfb 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh @@ -215,7 +215,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph + utils/mkgraph.sh data/langp_test $dir $dir/graph fi exit 0 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh index 0b3e70b5a04..b9ff6c1a15d 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh @@ -215,7 +215,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph + utils/mkgraph.sh data/langp_test $dir $dir/graph fi exit 0 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh index 45f2907645e..1c3f26e7def 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh @@ -216,7 +216,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph + utils/mkgraph.sh data/langp_test $dir $dir/graph fi exit 0 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh index 0d92aff5c28..2342437c83c 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh @@ -216,7 +216,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph + utils/mkgraph.sh data/langp_test $dir $dir/graph fi exit 0 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh index 4129c00dcb4..38bdcfda2f5 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh @@ -216,7 +216,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph + utils/mkgraph.sh data/langp_test $dir $dir/graph fi exit 0 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh index 1cfa50c1aa1..d0c7ca09b1c 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh @@ -216,7 +216,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph + utils/mkgraph.sh data/langp_test $dir $dir/graph fi exit 0 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh index ba8ac1e0373..ceb94f5e16b 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh @@ -218,7 +218,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph + utils/mkgraph.sh data/langp_test $dir $dir/graph fi exit 0 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh index 5de285e080e..98c6b13aeef 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh @@ -220,7 +220,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph + utils/mkgraph.sh data/langp_test $dir $dir/graph fi exit 0 diff --git a/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index ec530ef1ce4..fac93f0a790 100755 --- a/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -110,7 +110,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi @@ -235,7 +235,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh index 716bdce3729..c5d4106e44a 100755 --- a/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -147,7 +147,7 @@ if [ $stage -le 4 ]; then # as long as phones.txt was compatible. 
utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/bentham/v1/run_end2end.sh b/egs/bentham/v1/run_end2end.sh index 63c034e41f6..5d821aeb9ee 100755 --- a/egs/bentham/v1/run_end2end.sh +++ b/egs/bentham/v1/run_end2end.sh @@ -111,7 +111,7 @@ if [ $stage -le 6 ]; then echo "$0: Aligning the training data using the e2e chain model..." steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ --use-gpu false \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + --scale-opts ' --acoustic-scale=1.0' \ data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi diff --git a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh index 3f8b7c60090..fb254339cb5 100755 --- a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh @@ -321,7 +321,7 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgpr_5k/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgpr_5k \ + data/lang_test_tgpr_5k \ $tree_dir $tree_dir/graph_tgpr_5k || exit 1; fi diff --git a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_lstm_1a.sh index 8b4e93cd05b..19ea72a944a 100755 --- a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -275,7 +275,7 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgpr_5k/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgpr_5k \ + data/lang_test_tgpr_5k \ $tree_dir $tree_dir/graph_tgpr_5k || exit 1; fi diff --git a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh index f0f469e46c8..2fb91a07a33 100755 --- a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh @@ -212,7 +212,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + data/lang${lm_suffix}/ \ $tree_dir $tree_dir/graph${lm_suffix} || exit 1; fi diff --git a/egs/chime5/s5/local/run_recog.sh b/egs/chime5/s5/local/run_recog.sh index 5c74c9ff242..9da73a02821 100755 --- a/egs/chime5/s5/local/run_recog.sh +++ b/egs/chime5/s5/local/run_recog.sh @@ -130,7 +130,7 @@ if [ $stage -le 18 ]; then chunk_right_context=0 utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + data/lang${lm_suffix}/ \ $tree_dir $tree_dir/graph${lm_suffix} || exit 1; frames_per_chunk=$(echo $chunk_width | cut -d, -f1) diff --git a/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh b/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh index 95e9d934bd3..7b14b7dff67 100755 --- a/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh +++ b/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh @@ -246,7 +246,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). 
utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + data/lang${lm_suffix}/ \ $tree_dir $tree_dir/graph${lm_suffix} || exit 1; fi diff --git a/egs/chime5/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/chime5/s5b/local/chain/tuning/run_tdnn_1a.sh index daad37e2cd7..3b6c73e41d8 100755 --- a/egs/chime5/s5b/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/chime5/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -216,7 +216,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + data/lang${lm_suffix}/ \ $tree_dir $tree_dir/graph${lm_suffix} || exit 1; fi diff --git a/egs/chime5/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/chime5/s5b/local/chain/tuning/run_tdnn_1b.sh index e033715d884..ccde8a0fcd3 100755 --- a/egs/chime5/s5b/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/chime5/s5b/local/chain/tuning/run_tdnn_1b.sh @@ -224,7 +224,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + data/lang${lm_suffix}/ \ $tree_dir $tree_dir/graph${lm_suffix} || exit 1; fi diff --git a/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh index e3d8e6ac4dc..e80797de57a 100755 --- a/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -239,7 +239,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + data/lang${lm_suffix}/ \ $tree_dir $tree_dir/graph${lm_suffix} || exit 1; fi diff --git a/egs/chime5/s5b/local/run_recog.sh b/egs/chime5/s5b/local/run_recog.sh index 5c74c9ff242..9da73a02821 100755 --- a/egs/chime5/s5b/local/run_recog.sh +++ b/egs/chime5/s5b/local/run_recog.sh @@ -130,7 +130,7 @@ if [ $stage -le 18 ]; then chunk_right_context=0 utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + data/lang${lm_suffix}/ \ $tree_dir $tree_dir/graph${lm_suffix} || exit 1; frames_per_chunk=$(echo $chunk_width | cut -d, -f1) diff --git a/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh index d4acd0fed4b..74d37961396 100755 --- a/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh @@ -229,7 +229,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). 
utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test \ + data/lang_test \ $tree_dir $tree_dir/graph || exit 1; fi diff --git a/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh index 75ceb80e3e0..1cb21d96375 100755 --- a/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh @@ -213,7 +213,7 @@ fi if [ $stage -le 14 ]; then utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_csj_tg $dir $dir/graph_csj_tg + data/lang_csj_tg $dir $dir/graph_csj_tg for decode_set in $test_sets; do steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj 10 \ diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh index 7f407552c2e..5f4690d05b4 100755 --- a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh @@ -249,7 +249,7 @@ if [ $stage -le 21 ]; then #LM was trained only on Fisher Spanish train subset. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test \ + data/lang_test \ $tree_dir $tree_dir/graph_fsp_train || exit 1; fi diff --git a/egs/fisher_english/s5/local/chain/run_tdnn.sh b/egs/fisher_english/s5/local/chain/run_tdnn.sh index 1fd0f1fdf3a..424a4610bab 100755 --- a/egs/fisher_english/s5/local/chain/run_tdnn.sh +++ b/egs/fisher_english/s5/local/chain/run_tdnn.sh @@ -193,7 +193,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $graph_dir + utils/mkgraph.sh data/lang_test $dir $graph_dir fi decode_suff= diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh index 07636a8b3c8..a4040e9494a 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh @@ -126,7 +126,7 @@ for f in data/${supervised_set_perturbed}/feats.scp \ done if [ ! -f $graphdir/HCLG.fst ]; then - utils/mkgraph.sh --self-loop-scale 1.0 $unsup_decode_lang $sup_chain_dir $graphdir + utils/mkgraph.sh $unsup_decode_lang $sup_chain_dir $graphdir fi # Prepare the speed-perturbed unsupervised data directory @@ -402,7 +402,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir + utils/mkgraph.sh ${test_lang} $dir $test_graph_dir fi if [ $stage -le 18 ]; then diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh index b1c133942ef..aa2818c23ce 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh @@ -224,7 +224,7 @@ if [ $stage -le 15 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_poco_unk $dir $graph_dir + utils/mkgraph.sh data/lang_test_poco_unk $dir $graph_dir fi decode_suff= diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh index 04244014502..ed487734eef 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh @@ -138,7 +138,7 @@ for f in data/${supervised_set_perturbed}/feats.scp \ done if [ ! -f $graphdir/HCLG.fst ]; then - utils/mkgraph.sh --self-loop-scale 1.0 $unsup_decode_lang $sup_chain_dir $graphdir + utils/mkgraph.sh $unsup_decode_lang $sup_chain_dir $graphdir fi if [ $stage -le 2 ]; then @@ -421,7 +421,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir + utils/mkgraph.sh ${test_lang} $dir $test_graph_dir fi if [ $stage -le 18 ]; then diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh index 66f87c8da8f..ce32a3ca9b7 100755 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh @@ -143,7 +143,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg + utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg fi decode_suff=fsh_sw1_tg diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh index c12f604f26b..0bedf85c8cb 100755 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh @@ -216,7 +216,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg + utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg fi decode_suff=fsh_sw1_tg diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh index 543f753bd4e..0179ebd26e3 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh @@ -135,7 +135,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg + utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg fi decode_suff=fsh_sw1_tg diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh index efcd1eced4a..910bbe358bf 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh @@ -212,7 +212,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg + utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg fi decode_suff=fsh_sw1_tg diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh index e4a555abfdd..ac990889e2a 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh @@ -221,7 +221,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg + utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg fi decode_suff=fsh_sw1_tg diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh index 5650cedca28..89ef17fa9bc 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh @@ -232,7 +232,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg + utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg fi decode_suff=fsh_sw1_tg diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh index 5beb2e74a9a..1a711089912 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh @@ -312,7 +312,7 @@ if [ $stage -le 15 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg + utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg fi decode_suff=fsh_sw1_tg diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh index f3cc869e6de..aed698b343d 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh @@ -242,7 +242,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg + utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg fi decode_suff=fsh_sw1_tg diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh index 059a81e15fc..cd5910cf9b4 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh @@ -239,7 +239,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg + utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg fi decode_suff=fsh_sw1_tg diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh index d86b699d6f6..51546ddd622 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh @@ -240,7 +240,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg + utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg fi decode_suff=fsh_sw1_tg diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh index 66c5ad3335f..e4aa735a9d8 100755 --- a/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh @@ -164,7 +164,7 @@ if [ $stage -le 12 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph + utils/mkgraph.sh data/lang_test $dir $dir/graph fi graph_dir=$dir/graph diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh index 1981bb0530d..ec2f9dc1b6c 100755 --- a/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh @@ -172,7 +172,7 @@ if [ $stage -le 12 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph + utils/mkgraph.sh data/lang_test $dir $dir/graph fi graph_dir=$dir/graph diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh index 6fa10344cfc..c3ee11a0638 100755 --- a/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh @@ -174,7 +174,7 @@ if [ $stage -le 12 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph + utils/mkgraph.sh data/lang_test $dir $dir/graph fi graph_dir=$dir/graph diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh index 1f4b7e12850..03d739579bd 100755 --- a/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh @@ -173,7 +173,7 @@ if [ $stage -le 12 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph + utils/mkgraph.sh data/lang_test $dir $dir/graph fi graph_dir=$dir/graph diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh index bf2e45c9914..d62d214d957 100755 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -200,7 +200,7 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test \ + data/lang_test \ $tree_dir $tree_dir/graph || exit 1; fi diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh index deebafc95e4..5278b97591a 100755 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -206,7 +206,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph + utils/mkgraph.sh --left-biphone data/lang_test $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh index bf2e45c9914..d62d214d957 100755 --- a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh @@ -200,7 +200,7 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test \ + data/lang_test \ $tree_dir $tree_dir/graph || exit 1; fi diff --git a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh index deebafc95e4..5278b97591a 100755 --- a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -206,7 +206,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph + utils/mkgraph.sh --left-biphone data/lang_test $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/gp/s1/steps/align_deltas.sh b/egs/gp/s1/steps/align_deltas.sh index 22da04432c7..37406b4d2a6 100755 --- a/egs/gp/s1/steps/align_deltas.sh +++ b/egs/gp/s1/steps/align_deltas.sh @@ -93,7 +93,7 @@ mkdir -p $dir # Create copy of the tree and model and occs... 
cp $srcdir/{tree,final.mdl,final.occs} $dir || exit 1; -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" if [ ! -d $data/split$nj -o $data/split$nj -ot $data/feats.scp ]; then split_data.sh $data $nj diff --git a/egs/gp/s1/steps/train_deltas.sh b/egs/gp/s1/steps/train_deltas.sh index 0efe7b60379..45a4a54f861 100755 --- a/egs/gp/s1/steps/train_deltas.sh +++ b/egs/gp/s1/steps/train_deltas.sh @@ -125,7 +125,7 @@ if [ ! -f $alidir/final.mdl ]; then exit 1; fi -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" realign_iters="10 20 30"; oov_sym=`cat $lang/oov.txt` silphonelist=`cat $lang/silphones.csl` diff --git a/egs/gp/s1/steps/train_mono.sh b/egs/gp/s1/steps/train_mono.sh index e82c14fcaf2..c4e2ad42228 100755 --- a/egs/gp/s1/steps/train_mono.sh +++ b/egs/gp/s1/steps/train_mono.sh @@ -77,7 +77,7 @@ dir=$3 [ -f path.sh ] && . ./path.sh # Configuration: -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" numiters=40 # Number of iterations of training maxiterinc=30 # Last iter to increase #Gauss on. numgauss=300 # Initial num-Gauss (must be more than #states=3*phones). diff --git a/egs/gp/s1/utils/lmrescore.sh b/egs/gp/s1/utils/lmrescore.sh index c911d0ce8b0..bf70021f13e 100755 --- a/egs/gp/s1/utils/lmrescore.sh +++ b/egs/gp/s1/utils/lmrescore.sh @@ -157,7 +157,7 @@ case "$mode" in lattice-compose ark:- $outdir/Ldet.fst ark:- \| \ lattice-determinize ark:- ark:- \| \ lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \ - lattice-add-trans-probs --transition-scale=1.0 --self-loop-scale=0.1 \ + lattice-add-trans-probs \ $mdl ark:- ark:- \| \ gzip -c \>$newlat || error_exit "Error doing LM rescoring." ;; diff --git a/egs/gp/s1/utils/mkgraph.sh b/egs/gp/s1/utils/mkgraph.sh index 3aba742832d..14a4048ffba 100755 --- a/egs/gp/s1/utils/mkgraph.sh +++ b/egs/gp/s1/utils/mkgraph.sh @@ -19,7 +19,7 @@ # all the language-model, pronunciation dictionary (lexicon), context-dependency, # and HMM structure in our model. The output is a Finite State Transducer # that has word-ids on the output, and pdf-ids on the input (these are indexes -# that resolve to Gaussian Mixture Models). +# that resolve to Gaussian Mixture Models). # See # http://kaldi-asr.org/doc/graph_recipe_test.html # (this is compiled from this repository using Doxygen, @@ -30,7 +30,7 @@ N=3 P=1 clean=false -for x in 1 2 3; do +for x in 1 2 3; do if [ $1 == "--mono" ]; then N=1; P=0; @@ -60,9 +60,6 @@ if $clean; then rm -r $lang/tmp; fi mkdir -p $dir -tscale=1.0 -loopscale=0.1 - # If $lang/tmp/LG.fst does not exist or is older than its sources, make it... # (note: the [[ ]] brackets make the || type operators work (inside [ ], we # would have to use -o instead), -f means file exists, and -ot means older than). @@ -101,7 +98,7 @@ fi if [[ ! -f $dir/Ha.fst || $dir/Ha.fst -ot $model \ || $dir/Ha.fst -ot $lang/tmp/ilabels_${N}_${P} ]]; then make-h-transducer --disambig-syms-out=$dir/disambig_tid.list \ - --transition-scale=$tscale $lang/tmp/ilabels_${N}_${P} $tree $model \ + $lang/tmp/ilabels_${N}_${P} $tree $model \ > $dir/Ha.fst || exit 1; fi @@ -114,13 +111,10 @@ if [[ ! -f $dir/HCLGa.fst || $dir/HCLGa.fst -ot $dir/Ha.fst || \ fi if [[ ! 
-f $dir/HCLG.fst || $dir/HCLG.fst -ot $dir/HCLGa.fst ]]; then - add-self-loops --self-loop-scale=$loopscale --reorder=true \ + add-self-loops \ $model < $dir/HCLGa.fst > $dir/HCLG.fst || exit 1; - if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then - # No point doing this test if transition-scale not 1, as it is bound to fail. - fstisstochastic $dir/HCLG.fst || echo "Final HCLG is not stochastic." - fi + fstisstochastic $dir/HCLG.fst || echo "Final HCLG is not stochastic." fi # keep a copy of the lexicon and a list of silence phones with HCLG... diff --git a/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh index 361879b4142..0a40bd33c66 100755 --- a/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh +++ b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -249,7 +249,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 \ + \ data/lang_test \ $tree_dir \ $tree_dir/graph || exit 1; diff --git a/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh index 290bd4c7970..147195d18b7 100755 --- a/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh @@ -236,7 +236,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 \ + \ data/lang_test \ $tree_dir \ $tree_dir/graph || exit 1; diff --git a/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh index cfb4dc1f697..3591f11d228 100755 --- a/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh @@ -232,7 +232,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 \ + \ data/lang_test \ $tree_dir \ $tree_dir/graph || exit 1; diff --git a/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh index c62b776de2b..c79606dcfd1 100755 --- a/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh +++ b/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh @@ -207,7 +207,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph + utils/mkgraph.sh data/lang_test $dir $dir/graph fi graph_dir=$dir/graph diff --git a/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh index d1b657a2d74..e2a51260ff5 100755 --- a/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh +++ b/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -254,7 +254,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). 
utils/mkgraph.sh \ - --self-loop-scale 1.0 data/langp_test \ + data/langp_test \ $tree_dir $dir/graph || exit 1; fi diff --git a/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1a.sh index 40bbbe1ae79..25b2224a855 100755 --- a/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1a.sh @@ -223,7 +223,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 data/langp_test \ + data/langp_test \ $tree_dir $dir/graph || exit 1; fi diff --git a/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1b.sh index a498d8157f3..246adb1e45d 100755 --- a/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/hub4_spanish/s5/local/chain/tuning/run_tdnn_1b.sh @@ -240,7 +240,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 data/langp_test \ + data/langp_test \ $tree_dir $dir/graph || exit 1; fi diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh index ef1273f3961..a0655c6f247 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh @@ -209,7 +209,7 @@ if [ $stage -le 6 ]; then # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_decode \ + data/$lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh index bbcc55aa2b0..2b80fbcb4de 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1a.sh @@ -100,7 +100,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ $train_data_dir data/lang $chain_model_dir $lat_dir cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts fi @@ -206,7 +206,7 @@ if [ $stage -le 6 ]; then # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_decode \ + data/$lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh index 401ffa14e19..ad7367b614e 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh @@ -98,7 +98,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ $train_data_dir data/lang $chain_model_dir $lat_dir cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts fi @@ -207,7 +207,7 @@ if [ $stage -le 6 ]; then # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. 
utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_decode \ + data/$lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh index 17209b9204f..3770eb0aa40 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh @@ -97,7 +97,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ $train_data_dir data/lang $chain_model_dir $lat_dir cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts fi @@ -213,7 +213,7 @@ if [ $stage -le 6 ]; then # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_decode \ + data/$lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1d.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1d.sh index 89a40ed2a13..e5d12aabbb7 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1d.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1d.sh @@ -101,7 +101,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ $train_data_dir data/lang $chain_model_dir $lat_dir cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts fi @@ -215,7 +215,7 @@ if [ $stage -le 6 ]; then # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_decode \ + data/$lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index 703d404159a..81399230b2e 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -94,7 +94,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi @@ -211,7 +211,7 @@ if [ $stage -le 6 ]; then # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_decode \ + data/$lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh index 905c4661477..ecc93e9341a 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -91,7 +91,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ $train_data_dir data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi @@ -203,7 +203,7 @@ if [ $stage -le 6 ]; then # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. 
utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_decode \ + data/$lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh index 26b1aca0929..f7cf2d3ff59 100755 --- a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -93,7 +93,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ $train_data_dir data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi @@ -206,7 +206,7 @@ if [ $stage -le 6 ]; then # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh index 462ad0522de..72ad70e7dcd 100755 --- a/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -138,7 +138,7 @@ if [ $stage -le 4 ]; then # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh index 0a8b014715f..e81c2eb54ba 100755 --- a/egs/iam/v1/run_end2end.sh +++ b/egs/iam/v1/run_end2end.sh @@ -114,7 +114,7 @@ if [ $stage -le 6 ]; then echo "$0: Aligning the training data using the e2e chain model..." steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ --use-gpu false \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + --scale-opts ' --acoustic-scale=1.0' \ data/$train_set data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh index 9a01688ba35..10a69265b3f 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -106,7 +106,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi @@ -231,7 +231,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh index 28aa246f334..ce3fda36052 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -108,7 +108,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi @@ -233,7 +233,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. 
utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh index f158317950a..c253a796813 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -110,7 +110,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi @@ -234,7 +234,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index 1c44057454a..dde868d6918 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -108,7 +108,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi @@ -233,7 +233,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh index cb2bfa0a82d..1758efd8f4d 100755 --- a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh @@ -158,7 +158,7 @@ if [ $stage -le 4 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh index d5f79602695..f02246503d1 100755 --- a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh @@ -144,7 +144,7 @@ if [ $stage -le 4 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh index c515c85fc72..51dc6737c86 100755 --- a/egs/iam/v2/run_end2end.sh +++ b/egs/iam/v2/run_end2end.sh @@ -136,7 +136,7 @@ if [ $stage -le 7 ]; then echo "$0: Aligning the training data using the e2e chain model..." steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ --use-gpu false \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + --scale-opts ' --acoustic-scale=1.0' \ data/train_aug data/lang exp/chain/e2e_cnn_1b exp/chain/e2e_ali_train fi diff --git a/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh index 10650a18269..af7f6599f97 100755 --- a/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh @@ -223,7 +223,7 @@ if [ $stage -le 14 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). 
utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test \ + data/lang_test \ $tree_dir $tree_dir/graph || exit 1; fi diff --git a/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh index db62e6f8a55..e657c9bc3f3 100755 --- a/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh @@ -237,7 +237,7 @@ if [ $stage -le 14 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test \ + data/lang_test \ $tree_dir $tree_dir/graph || exit 1; fi diff --git a/egs/ifnenit/v1/local/chain/run_cnn_1a.sh b/egs/ifnenit/v1/local/chain/run_cnn_1a.sh index b0ecd547741..d0c386e75ee 100755 --- a/egs/ifnenit/v1/local/chain/run_cnn_1a.sh +++ b/egs/ifnenit/v1/local/chain/run_cnn_1a.sh @@ -211,7 +211,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + data/$lang_test \ $dir $dir/graph || exit 1; fi diff --git a/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh b/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh index 7f3132d657e..7eeb6f4a15c 100755 --- a/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh +++ b/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh @@ -101,7 +101,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $chain_model_dir $lat_dir cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts fi @@ -216,7 +216,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + data/$lang_test \ $dir $dir/graph || exit 1; fi diff --git a/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh b/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh index 6bf3a139ad1..0e5a3410e31 100755 --- a/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh +++ b/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh @@ -125,7 +125,7 @@ if [ $stage -le 1 ]; then # have some stragglers. steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \ --online-ivector-dir $train_ivector_dir \ - --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \ + --scale-opts "--acoustic-scale=1.0 " \ --nj $nj $train_data_dir $lang $srcdir ${srcdir}_ali${affix} ; fi @@ -139,7 +139,7 @@ if [ -z "$lats_dir" ]; then subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving # total slots = 80 * 6 = 480. 
steps/nnet3/make_denlats.sh --cmd "$decode_cmd" \ - --self-loop-scale 1.0 --acwt 1.0 --determinize true \ + --acwt 1.0 --determinize true \ --online-ivector-dir $train_ivector_dir \ --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \ $train_data_dir $lang $srcdir ${lats_dir} ; diff --git a/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh index db17a35be64..a191aba2db9 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -16,7 +16,7 @@ # WER on test(fglarge) 3.80 3.69 # WER on test(tglarge) 3.89 3.80 # WER on test(tgmed) 4.72 4.64 -# WER on test(tgsmall) 5.19 5.16 +# WER on test(tgsmall) 5.19 5.16 # WER on test_other(fglarge) 8.76 8.71 # WER on test_other(tglarge) 9.19 9.11 # WER on test_other(tgmed) 11.22 11.00 @@ -211,7 +211,7 @@ if [ $stage -le 16 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir + utils/mkgraph.sh --remove-oov data/lang_test_tgsmall $dir $graph_dir fi iter_opts= @@ -226,7 +226,7 @@ if [ $stage -le 17 ]; then --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 - steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ @@ -271,4 +271,4 @@ if $test_online_decoding && [ $stage -le 18 ]; then fi exit 0; - + diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1a.sh index fb652a719a2..48184bca926 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1a.sh @@ -178,7 +178,7 @@ if [ $stage -le 16 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir + utils/mkgraph.sh --remove-oov data/lang_test_tgsmall $dir $graph_dir fi @@ -194,7 +194,7 @@ if [ $stage -le 17 ]; then --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 - steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh index 48d6ddb804f..196f4b5a709 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh @@ -205,7 +205,7 @@ if [ $stage -le 16 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir + utils/mkgraph.sh --remove-oov data/lang_test_tgsmall $dir $graph_dir fi @@ -221,7 +221,7 @@ if [ $stage -le 17 ]; then --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 - steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh index 101fd6a4c15..e1baf2c792d 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh @@ -196,7 +196,7 @@ if [ $stage -le 16 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir + utils/mkgraph.sh --remove-oov data/lang_test_tgsmall $dir $graph_dir fi iter_opts= @@ -211,7 +211,7 @@ if [ $stage -le 17 ]; then --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 - steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh index 865b10dea0c..da6cfada36f 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh @@ -296,7 +296,7 @@ if [ $stage -le 16 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir + utils/mkgraph.sh --remove-oov data/lang_test_tgsmall $dir $graph_dir fi iter_opts= @@ -311,7 +311,7 @@ if [ $stage -le 17 ]; then --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 - steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh index 0e97e46194d..7237c1463c7 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -179,7 +179,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir + utils/mkgraph.sh --remove-oov data/lang_test_tgsmall $dir $graph_dir fi @@ -200,7 +200,7 @@ if [ $stage -le 15 ]; then --frames-per-chunk "$frames_per_chunk_primary" \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 - steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh index 0da813267fc..9ddd2457312 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -214,7 +214,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir + utils/mkgraph.sh --remove-oov data/lang_test_tgsmall $dir $graph_dir fi @@ -235,7 +235,7 @@ if [ $stage -le 15 ]; then --frames-per-chunk "$frames_per_chunk_primary" \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 - steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh index 892ee441516..33de4ae8b93 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh @@ -194,7 +194,7 @@ if [ $stage -le 6 ]; then # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh index 7ca7c652fd2..62ec4686ed6 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh @@ -86,7 +86,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $chain_model_dir $lat_dir cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts fi @@ -194,7 +194,7 @@ if [ $stage -le 6 ]; then # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. 
utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index a8bc1836ffe..59a167f1e64 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -87,7 +87,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi @@ -206,7 +206,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh index 0828e051dcc..fdf6f994268 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -100,7 +100,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi @@ -219,7 +219,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh index ccbb7119674..5403dd2af05 100755 --- a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh +++ b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh @@ -94,7 +94,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts @@ -213,7 +213,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh index 3fca8cf5fdc..90ca63a971e 100755 --- a/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh +++ b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh @@ -150,7 +150,7 @@ if [ $stage -le 4 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/madcat_ar/v1/local/tl/run_text_localization.sh b/egs/madcat_ar/v1/local/tl/run_text_localization.sh index 8d12f7d802f..5066adc73dd 100755 --- a/egs/madcat_ar/v1/local/tl/run_text_localization.sh +++ b/egs/madcat_ar/v1/local/tl/run_text_localization.sh @@ -133,7 +133,7 @@ if [ $stage -le 6 ]; then echo "$0: Aligning the training data using the e2e chain model...$(date)." 
steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ --use-gpu false \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + --scale-opts ' --acoustic-scale=1.0' \ data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 62f4eeb7c71..bb22e1b1a8e 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -119,7 +119,7 @@ fi if [ $stage -le 5 ] && $decode_e2e; then echo "$0: $(date) stage 5: decoding end2end setup..." - utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode \ + utils/mkgraph.sh $lang_decode \ exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj $nj --cmd "$cmd" \ diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh index 164d62a7ad9..6affb1587aa 100755 --- a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh @@ -206,7 +206,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test \ + data/lang_test \ $dir $dir/graph || exit 1; fi diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh index be51bdcc3d1..46df193483c 100755 --- a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh @@ -92,7 +92,7 @@ if [ $stage -le 2 ]; then # Get the alignments as lattices (gives the chain training more freedom). # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $chain_model_dir $lat_dir cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts fi @@ -202,7 +202,7 @@ if [ $stage -le 6 ]; then # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test \ + data/lang_test \ $dir $dir/graph || exit 1; fi diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh index aa61620a92f..a478a63160f 100755 --- a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh @@ -96,7 +96,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $chain_model_dir $lat_dir cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts fi @@ -210,7 +210,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test \ + data/lang_test \ $dir $dir/graph || exit 1; fi diff --git a/egs/madcat_zh/v1/run_end2end.sh b/egs/madcat_zh/v1/run_end2end.sh index 7e0fc1e25d1..a89222fe1b4 100755 --- a/egs/madcat_zh/v1/run_end2end.sh +++ b/egs/madcat_zh/v1/run_end2end.sh @@ -96,7 +96,7 @@ fi if [ $stage -le 5 ] && $decode_e2e; then echo "$0: $(date) stage 5: decoding end2end setup..." 
- utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode \ + utils/mkgraph.sh $lang_decode \ exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj $nj --cmd "$cmd" \ diff --git a/egs/material/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/material/s5/local/chain/tuning/run_tdnn_1a.sh index 4f38ee886a7..f7c99ef08d5 100755 --- a/egs/material/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/material/s5/local/chain/tuning/run_tdnn_1a.sh @@ -243,11 +243,11 @@ if [ $stage -le 12 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test \ + data/lang_test \ $tree_dir $tree_dir/graph || exit 1; utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_combined_test \ + data/lang_combined_test \ $tree_dir ${tree_dir}/graph_combined || exit 1; fi diff --git a/egs/material/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/material/s5/local/chain/tuning/run_tdnn_1b.sh index 023cb34b43d..4c853eefa9f 100755 --- a/egs/material/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/material/s5/local/chain/tuning/run_tdnn_1b.sh @@ -268,7 +268,7 @@ if [ $stage -le 12 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_combined_test \ + data/lang_combined_test \ $tree_dir ${tree_dir}/graph_combined || exit 1; fi diff --git a/egs/material/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/material/s5/local/chain/tuning/run_tdnn_lstm_1a.sh index af5a62dad0d..576d1146d63 100755 --- a/egs/material/s5/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/material/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -234,11 +234,11 @@ if [ $stage -le 12 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test \ + data/lang_test \ $tree_dir $tree_dir/graph || exit 1; utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_combined_test \ + data/lang_combined_test \ $tree_dir ${tree_dir}/graph_combined || exit 1; fi diff --git a/egs/material/s5/local/semisup/chain/tuning/run_tdnn_1a.sh b/egs/material/s5/local/semisup/chain/tuning/run_tdnn_1a.sh index 3d3056182ee..6b641a9235c 100755 --- a/egs/material/s5/local/semisup/chain/tuning/run_tdnn_1a.sh +++ b/egs/material/s5/local/semisup/chain/tuning/run_tdnn_1a.sh @@ -246,7 +246,7 @@ if [ $stage -le 12 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_combined_test \ + data/lang_combined_test \ $tree_dir ${tree_dir}/graph_combined || exit 1; fi diff --git a/egs/material/s5/local/semisup/chain/tuning/run_tdnn_semisupervised_1a.sh b/egs/material/s5/local/semisup/chain/tuning/run_tdnn_semisupervised_1a.sh index 37c957a3227..54bfa09b261 100755 --- a/egs/material/s5/local/semisup/chain/tuning/run_tdnn_semisupervised_1a.sh +++ b/egs/material/s5/local/semisup/chain/tuning/run_tdnn_semisupervised_1a.sh @@ -124,7 +124,7 @@ done if [ $stage -le 1 ]; then if [ ! 
-f $graphdir/HCLG.fst ]; then - utils/mkgraph.sh --self-loop-scale 1.0 $unsup_decode_lang $sup_chain_dir $graphdir + utils/mkgraph.sh $unsup_decode_lang $sup_chain_dir $graphdir fi fi @@ -439,7 +439,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir + utils/mkgraph.sh ${test_lang} $dir $test_graph_dir fi if [ $stage -le 14 ]; then diff --git a/egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh index 6300511e817..9b6caa24e09 100644 --- a/egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/mgb5/s5/local/chain/tuning/run_tdnn_1a.sh @@ -224,7 +224,7 @@ if [ $stage -le 14 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test \ + data/lang_test \ $tree_dir $tree_dir/graph || exit 1; fi diff --git a/egs/mini_librispeech/s5/local/chain/run_att.sh b/egs/mini_librispeech/s5/local/chain/run_att.sh new file mode 120000 index 00000000000..bf5d5a0c0f1 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chain/run_att.sh @@ -0,0 +1 @@ +tuning/run_att_1a.sh \ No newline at end of file diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_att_1a.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_att_1a.sh new file mode 100755 index 00000000000..2238e66f041 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_att_1a.sh @@ -0,0 +1,300 @@ +#!/bin/bash + +# run_att_1a.sh is similar to run_tdnn_1h.sh but with some TDNN layers replaced +# with attention layers. + + +# Note: below, att1a and att1a2 are two different runs of the same script. +# +# local/chain/compare_wer.sh exp/chain/tdnn1h_sp exp/chain/att1a_sp exp/chain/att1a2_sp +# System tdnn1h_sp att1a_sp att1a2_sp +#WER dev_clean_2 (tgsmall) 12.27 12.16 12.65 +#WER dev_clean_2 (tglarge) 8.61 8.68 8.94 +# Final train prob -0.0462 -0.0434 -0.0425 +# Final valid prob -0.0814 -0.0807 -0.0814 +# Final train prob (xent) -1.1354 -1.0721 -1.0647 +# Final valid prob (xent) -1.3680 -1.3254 -1.3263 +# Num-params 5210944 4193064 4193064 + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1h_sp exp/chain/att1a_sp +# exp/chain/tdnn1h_sp: num-iters=34 nj=2..5 num-params=5.2M dim=40+100->2336 combine=-0.049->-0.047 (over 3) xent:train/valid[21,33,final]=(-1.36,-1.16,-1.14/-1.57,-1.40,-1.37) logprob:train/valid[21,33,final]=(-0.061,-0.051,-0.046/-0.094,-0.089,-0.081) +# exp/chain/att1a_sp: num-iters=34 nj=2..5 num-params=4.2M dim=40+100->2336 combine=-0.046->-0.044 (over 4) xent:train/valid[21,33,final]=(-1.30,-1.10,-1.07/-1.53,-1.38,-1.33) logprob:train/valid[21,33,final]=(-0.057,-0.049,-0.043/-0.091,-0.087,-0.081) +# exp/chain/att1a2_sp: num-iters=34 nj=2..5 num-params=4.2M dim=40+100->2336 combine=-0.046->-0.044 (over 3) xent:train/valid[21,33,final]=(-1.30,-1.08,-1.06/-1.53,-1.36,-1.33) logprob:train/valid[21,33,final]=(-0.056,-0.048,-0.043/-0.091,-0.085,-0.081) + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. 
Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + attention_opts="l2-regularize=0.01 bottleneck-dim=96 num-heads=4 value-dim=50 key-dim=50 time-stride=3 num-left-inputs=4 num-right-inputs=2 bypass-scale=0.66" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + attention-block name=att7 $attention_opts + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + attention-block name=att9 $attention_opts + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 
+ linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 3 EOF - utils/mkgraph.sh --self-loop-scale 1.0 $lang $tree_dir $tree_dir/grammar2b + utils/mkgraph.sh $lang $tree_dir $tree_dir/grammar2b # test that the binary 'compile-graph' does the same thing as mkgraph.sh. diff --git a/egs/mini_librispeech/s5/local/grammar/simple_demo_silprobs.sh b/egs/mini_librispeech/s5/local/grammar/simple_demo_silprobs.sh index 414227f2ad6..088b20eba1a 100755 --- a/egs/mini_librispeech/s5/local/grammar/simple_demo_silprobs.sh +++ b/egs/mini_librispeech/s5/local/grammar/simple_demo_silprobs.sh @@ -65,7 +65,7 @@ if [ $stage -le 2 ]; then 2 0.69314718055994 3 EOF - utils/mkgraph.sh --self-loop-scale 1.0 $lang $tree_dir $tree_dir/grammar1 + utils/mkgraph.sh $lang $tree_dir $tree_dir/grammar1 # test that the binary 'compile-graph' does the same thing as mkgraph.sh. 
compile-graph --read-disambig-syms=$lang/phones/disambig.int $tree_dir/tree $tree_dir/1.mdl $lang/L_disambig.fst $lang/G.fst $tree_dir/grammar1/HCLG2.fst @@ -94,7 +94,7 @@ if [ $stage -le 3 ]; then 2 0.69314718055994 3 EOF - utils/mkgraph.sh --self-loop-scale 1.0 $lang $tree_dir $tree_dir/grammar2a + utils/mkgraph.sh $lang $tree_dir $tree_dir/grammar2a # test that the binary 'compile-graph' does the same thing as mkgraph.sh. offset=$(grep nonterm_bos $lang/phones.txt | awk '{print $2}') # 364 @@ -121,7 +121,7 @@ if [ $stage -le 4 ]; then 2 3 #nonterm_end 3 EOF - utils/mkgraph.sh --self-loop-scale 1.0 $lang $tree_dir $tree_dir/grammar2b + utils/mkgraph.sh $lang $tree_dir $tree_dir/grammar2b # test that the binary 'compile-graph' does the same thing as mkgraph.sh. diff --git a/egs/mini_librispeech/s5/local/kws/create_hitlist.sh b/egs/mini_librispeech/s5/local/kws/create_hitlist.sh index be06a3b9312..6ad516607d5 100755 --- a/egs/mini_librispeech/s5/local/kws/create_hitlist.sh +++ b/egs/mini_librispeech/s5/local/kws/create_hitlist.sh @@ -17,7 +17,7 @@ cmd=run.pl -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 boost_silence=1.0 diff --git a/egs/multi_en/s5/local/chain/run_blstm_6h.sh b/egs/multi_en/s5/local/chain/run_blstm_6h.sh index 126d29350a1..81ab737ddd9 100644 --- a/egs/multi_en/s5/local/chain/run_blstm_6h.sh +++ b/egs/multi_en/s5/local/chain/run_blstm_6h.sh @@ -151,7 +151,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg + utils/mkgraph.sh data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg fi decode_suff=fsh_sw1_tg diff --git a/egs/multi_en/s5/local/chain/tuning/run_tdnn_5b.sh b/egs/multi_en/s5/local/chain/tuning/run_tdnn_5b.sh index 96f5fdac8f3..31b467f9398 100755 --- a/egs/multi_en/s5/local/chain/tuning/run_tdnn_5b.sh +++ b/egs/multi_en/s5/local/chain/tuning/run_tdnn_5b.sh @@ -235,7 +235,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${multi}_${gmm}_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg + utils/mkgraph.sh data/lang_${multi}_${gmm}_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg fi decode_suff=fsh_sw1_tg diff --git a/egs/multi_en/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/multi_en/s5/local/chain/tuning/run_tdnn_lstm_1a.sh index 62266334962..13156cfc118 100755 --- a/egs/multi_en/s5/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/multi_en/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -252,7 +252,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 $lang_dir \ + utils/mkgraph.sh $lang_dir \ $dir $dir/graph${lang_suffix} fi diff --git a/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1a.sh index 79cd3eb3014..be12f7cca45 100755 --- a/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -241,7 +241,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${multi}_${gmm}_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg + utils/mkgraph.sh data/lang_${multi}_${gmm}_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg fi decode_suff=fsh_sw1_tg diff --git a/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1b.sh b/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1b.sh index a7170af9431..6e45ade836d 100755 --- a/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1b.sh +++ b/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1b.sh @@ -237,7 +237,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${multi}_${gmm}_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg + utils/mkgraph.sh data/lang_${multi}_${gmm}_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg fi decode_suff=fsh_sw1_tg diff --git a/egs/reverb/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/reverb/s5/local/chain/tuning/run_tdnn_1a.sh index c8b4997161e..e68051c1770 100755 --- a/egs/reverb/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/reverb/s5/local/chain/tuning/run_tdnn_1a.sh @@ -223,7 +223,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + data/lang${lm_suffix}/ \ $tree_dir $tree_dir/graph${lm_suffix} || exit 1; fi diff --git a/egs/reverb/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/reverb/s5/local/chain/tuning/run_tdnn_lstm_1a.sh index 4723400c76b..0d27842f18a 100755 --- a/egs/reverb/s5/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/reverb/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -239,7 +239,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + data/lang${lm_suffix}/ \ $tree_dir $tree_dir/graph${lm_suffix} || exit 1; fi diff --git a/egs/rimes/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/rimes/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index 33eb9dcb98c..f0303909c8e 100755 --- a/egs/rimes/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/rimes/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -110,7 +110,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 --generate-ali-from-lats true \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi @@ -239,7 +239,7 @@ if [ $stage -le 7 ]; then # as long as phones.txt was compatible. 
utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/rimes/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/rimes/v1/local/chain/tuning/run_e2e_cnn_1a.sh index 9d28a41316d..b6226ab5c2e 100755 --- a/egs/rimes/v1/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/rimes/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -140,7 +140,7 @@ if [ $stage -le 4 ]; then # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/rimes/v1/run_end2end.sh b/egs/rimes/v1/run_end2end.sh index d3e3da2be13..89ed4e656a6 100755 --- a/egs/rimes/v1/run_end2end.sh +++ b/egs/rimes/v1/run_end2end.sh @@ -103,7 +103,7 @@ fi if [ $stage -le 7 ]; then echo "$0: Aligning the training data using the e2e chain model..." steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + --scale-opts ' --acoustic-scale=1.0' \ data/$train_set data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_5g.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_5g.sh index c393a9aa28b..0464c073b2a 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_5g.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_5g.sh @@ -134,7 +134,7 @@ if [ $stage -le 10 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --scoring-opts "--min-lmwt 1" \ --nj 20 --cmd "$decode_cmd" \ @@ -143,7 +143,7 @@ if [ $stage -le 10 ]; then fi if [ $stage -le 11 ]; then - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug + utils/mkgraph.sh data/lang_ug $dir $dir/graph_ug steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj 20 --cmd "$decode_cmd" \ --online-ivector-dir exp/nnet2_online/ivectors_test \ diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_5n.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_5n.sh index 131bcf98de9..c2869d20731 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_5n.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_5n.sh @@ -136,7 +136,7 @@ if [ $stage -le 10 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --scoring-opts "--min-lmwt 1" \ --nj 20 --cmd "$decode_cmd" \ @@ -145,7 +145,7 @@ if [ $stage -le 10 ]; then fi if [ $stage -le 11 ]; then - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug + utils/mkgraph.sh data/lang_ug $dir $dir/graph_ug steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj 20 --cmd "$decode_cmd" \ --online-ivector-dir exp/nnet2_online/ivectors_test \ diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh index db5944fdbea..60cf4733185 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_5o.sh @@ -166,7 +166,7 @@ if [ $stage -le 10 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --scoring-opts "--min-lmwt 1" \ --nj 20 --cmd "$decode_cmd" \ @@ -175,7 +175,7 @@ if [ $stage -le 10 ]; then fi if [ $stage -le 11 ]; then - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug + utils/mkgraph.sh data/lang_ug $dir $dir/graph_ug steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj 20 --cmd "$decode_cmd" \ --online-ivector-dir exp/nnet2_online/ivectors_test \ diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh index 2fd2556c19b..df9c020bd30 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh @@ -198,7 +198,7 @@ if [ $stage -le 9 ]; then if $use_ivector;then ivec_opt="--online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test" fi - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --scoring-opts "--min-lmwt 1" \ --nj 20 --cmd "$decode_cmd" $ivec_opt \ diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh index 3e8d5717d4b..f21aec5c29a 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh @@ -211,7 +211,7 @@ if [ $stage -le 8 ]; then # the lang directory. 
ivec_opt="" if $use_ivector;then ivec_opt="--online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test" ; fi - utils/mkgraph.sh --self-loop-scale 1.0 $lang_src_tgt $dir $dir/graph + utils/mkgraph.sh $lang_src_tgt $dir $dir/graph steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --scoring-opts "--min-lmwt 1" \ --nj 20 --cmd "$decode_cmd" $ivec_opt \ diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh index 611aede371d..f6f4d9b0193 100755 --- a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh +++ b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh @@ -147,7 +147,7 @@ if [ $stage -le 4 ]; then --generate-ali-from-lats true \ --acoustic-scale 1.0 --extra-left-context-initial 0 --extra-right-context-final 0 \ --frames-per-chunk 150 \ - --scale-opts "--transition-scale=1.0 --self-loop-scale=1.0" \ + --scale-opts "" \ data/train_hires $lang_src_tgt $src_mdl_dir $lat_dir || exit 1; rm $lat_dir/fsts.*.gz # save space fi @@ -219,7 +219,7 @@ if [ $stage -le 8 ]; then tes_ivec_opt="" if $use_ivector;then test_ivec_opt="--online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test" ; fi - utils/mkgraph.sh --self-loop-scale 1.0 $lang_src_tgt $dir $dir/graph + utils/mkgraph.sh $lang_src_tgt $dir $dir/graph steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --scoring-opts "--min-lmwt 1" \ --nj 20 --cmd "$decode_cmd" $test_ivec_opt \ diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1a.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1a.sh index 47557f93696..8ebf33af18f 100755 --- a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1a.sh +++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1a.sh @@ -232,7 +232,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1b.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1b.sh index 7afa1b7f902..6414898fca7 100755 --- a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1b.sh +++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1b.sh @@ -233,7 +233,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1c.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1c.sh index e69e499e152..8ce19734a22 100755 --- a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1c.sh +++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1c.sh @@ -231,7 +231,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1d.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1d.sh index 86e0352828c..cc51f149446 100755 --- a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1d.sh +++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1d.sh @@ -244,7 +244,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1e.sh b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1e.sh index 313f899a471..b6b8ffb9885 100755 --- a/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1e.sh +++ b/egs/sprakbanken/s5/local/chain/tuning/run_lstm_1e.sh @@ -232,7 +232,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_tg $dir $dir/graph + utils/mkgraph.sh data/lang_test_tg $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1a.sh index 4991326a86d..cf28beb7691 100755 --- a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1a.sh @@ -177,7 +177,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1b.sh index 600f27ddf86..f9ffab37b73 100755 --- a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_1b.sh @@ -215,7 +215,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_tg $dir $dir/graph + utils/mkgraph.sh data/lang_test_tg $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_lstm_1a.sh index cedc448464a..6fc8735783f 100755 --- a/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/sprakbanken/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -231,7 +231,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_tg $dir $dir/graph + utils/mkgraph.sh data/lang_test_tg $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7f.sh b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7f.sh index d317b1dc55a..25a66075419 100755 --- a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7f.sh +++ b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7f.sh @@ -221,7 +221,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7k.sh b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7k.sh index 20dcab8eb50..2c9437e5343 100755 --- a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7k.sh +++ b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7k.sh @@ -234,7 +234,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi graph_dir=$dir/graph_sw1_tg diff --git a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_aug_1a.sh b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_aug_1a.sh index 8762430ee7f..353a77d8668 100755 --- a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_aug_1a.sh +++ b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_aug_1a.sh @@ -214,7 +214,7 @@ if [ $stage -le 16 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh index a1be44cdbbf..0c2c8ee1f54 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh @@ -172,7 +172,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h_discriminative.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h_discriminative.sh index d7382d78dc6..ed051c1729d 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h_discriminative.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h_discriminative.sh @@ -132,7 +132,7 @@ if [ $stage -le 1 ]; then # have some stragglers. 
steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \ --online-ivector-dir $online_ivector_dir $context_opts \ - --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \ + --scale-opts "--acoustic-scale=1.0 " \ --nj $nj $train_data_dir $lang $srcdir ${srcdir}_ali${affix} ; fi @@ -146,7 +146,7 @@ if [ -z "$lats_dir" ]; then subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving # total slots = 80 * 6 = 480. steps/nnet3/make_denlats.sh --cmd "$decode_cmd" \ - --self-loop-scale 1.0 --acwt 1.0 --determinize true \ + --acwt 1.0 --determinize true \ --online-ivector-dir $online_ivector_dir $context_opts \ --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \ $train_data_dir $lang $srcdir ${lats_dir} ; diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh index 1eac1c60c27..b633d64375b 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh @@ -178,7 +178,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh index acdae844b65..426e8647cd8 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh @@ -203,7 +203,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6k.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6k.sh index bbd8cb63697..7e36d47f1f9 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6k.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6k.sh @@ -203,7 +203,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh index 16f2ea211d0..5377ca916a1 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh @@ -213,7 +213,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6m.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6m.sh index 09f7d72434c..692b2240aa0 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6m.sh @@ -211,7 +211,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6n.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6n.sh index 8e44d0bc114..86e2a7786ce 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6n.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6n.sh @@ -217,7 +217,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6o.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6o.sh index 6a836e81b09..f0a746ca362 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6o.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6o.sh @@ -219,7 +219,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_cnn_tdnn_1a.sh index d1a61360f85..f4dee51dd03 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_cnn_tdnn_1a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -207,7 +207,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh index ac22e858aea..459e9b4b00d 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh @@ -177,7 +177,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh index aa48db04841..12646f07897 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh @@ -177,7 +177,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh index 48db81f586f..3b8a4b6f104 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh @@ -197,7 +197,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh index 021eab09506..65ef23f8ce3 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh @@ -204,7 +204,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6l.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6l.sh index f219167f9ec..b8e08c61ddf 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6l.sh @@ -215,7 +215,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh index 551be099390..532eb3ede7e 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh @@ -182,7 +182,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2a.sh index c584bbe29a6..318d026b8d6 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2a.sh @@ -220,8 +220,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2b.sh index 227a74067d4..e7b574f0ea3 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2b.sh @@ -211,8 +211,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2c.sh index 9fc08f27d45..556325b5bdc 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2c.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2c.sh @@ -202,7 +202,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2d.sh index 2ef8c374514..4c9b497f2fa 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2d.sh @@ -207,7 +207,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2e.sh index 2db9a59c2e2..4c31608c8f9 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2e.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2e.sh @@ -230,7 +230,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2f.sh index f510fccd882..d940e81efab 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2f.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2f.sh @@ -212,7 +212,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2g.sh index 65b48b43685..8f43562e4e3 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2g.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2g.sh @@ -215,7 +215,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2h.sh index d86233ff83b..5154834004b 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2h.sh @@ -217,7 +217,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2i.sh index cab9dd957a3..53e8e2a9ed6 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2i.sh @@ -215,7 +215,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2j.sh index 0eca2ff10ff..d5b5a80bef4 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2j.sh @@ -216,7 +216,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2k.sh index 7e127c10917..1eb99309edd 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2k.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2k.sh @@ -225,7 +225,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2l.sh index fbe45761996..53ad7c17e14 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2l.sh @@ -235,7 +235,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2m.sh index 93db16408cc..345446f3cc8 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2m.sh @@ -236,7 +236,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2n.sh index 57eb66dac35..1ee2a92f494 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2n.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2n.sh @@ -276,7 +276,7 @@ if [ $stage -le 16 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2o.sh index ae085c9804f..e5111d13bf2 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2o.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2o.sh @@ -235,7 +235,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2p.sh index 4c6ad3b9761..b7ae60e2449 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2p.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2p.sh @@ -250,7 +250,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2q.sh index 54b03fb2296..872a2d5bd04 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2q.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2q.sh @@ -244,7 +244,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2r.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2r.sh index 4bdc61ef0e5..657c8925e60 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2r.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2r.sh @@ -210,7 +210,7 @@ if [ $stage -le 11 ]; then # needed, as in this type of topology we only have a single pdf-class, # numbered zero. steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --cluster-phones-opts "--pdf-class-list=0" \ + --cluster-phones-opts "--pdf-class-list=1" \ --leftmost-questions-truncate $leftmost_questions_truncate \ --cmd "$train_cmd" 6000 data/$train_set data/lang_chain_2r $ali_dir $treedir fi @@ -248,7 +248,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2s.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2s.sh index 3e829e246f3..f326b2e286a 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2s.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2s.sh @@ -236,7 +236,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2t.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2t.sh index 4a322e1a8fa..8bd8111537d 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2t.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2t.sh @@ -240,7 +240,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2u.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2u.sh index 9ec5bf81d3d..1d506714faa 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2u.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2u.sh @@ -252,7 +252,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2v.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2v.sh index cd009cfcc12..7c3b7cccead 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2v.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2v.sh @@ -257,7 +257,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2w.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2w.sh index 687093c98c5..674be6e15e1 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2w.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2w.sh @@ -252,7 +252,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2x.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2x.sh index e2d6204af0c..df5513245f6 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2x.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2x.sh @@ -258,7 +258,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2y.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2y.sh index c1211feae64..1169193dfce 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_2y.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_2y.sh @@ -243,7 +243,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3c.sh index 01ff8079f2a..b5c73f9d700 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3c.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3c.sh @@ -250,7 +250,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3d.sh index 0cb513c84f1..0759e406e56 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3d.sh @@ -262,7 +262,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3e.sh index 687f684a68c..422505200ea 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3e.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3e.sh @@ -251,7 +251,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3f.sh index 0a4b935485a..32ae7f8ed07 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3f.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3f.sh @@ -258,7 +258,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3g.sh index 077a84d31e9..b1ebcfb706a 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3g.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3g.sh @@ -278,7 +278,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3h.sh index dcda3a00383..2c7cad476d2 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3h.sh @@ -264,7 +264,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3i.sh index 996795c9aee..3ca650c496f 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3i.sh @@ -285,7 +285,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3j.sh index 66e44fb6f04..752f6635e6c 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3j.sh @@ -271,7 +271,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k.sh index 5369b5251d1..fe474c82053 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k.sh @@ -285,7 +285,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k2.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k2.sh index 1902213402f..00fe73eacb8 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k2.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3k2.sh @@ -331,7 +331,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3l.sh index ae36ab2b65f..d15e25a50ec 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3l.sh @@ -281,7 +281,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3m.sh index 49656fb8aa7..5035a31837e 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3m.sh @@ -285,7 +285,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3n.sh index e2b0b0ebb10..06c6f360ab8 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3n.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3n.sh @@ -280,7 +280,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3o.sh index 298eb913ff3..23ae76103e5 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3o.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3o.sh @@ -284,7 +284,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3p.sh index 6ec9c6fe4b8..b907ad73b1a 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3p.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3p.sh @@ -308,7 +308,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3q.sh index 4c911ba867e..8c81726d34c 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3q.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3q.sh @@ -290,7 +290,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3r.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3r.sh index fba4ef6d15f..6a3a5a5d871 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3r.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3r.sh @@ -296,7 +296,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3s.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3s.sh index daab4cad318..790eb939b79 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3s.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3s.sh @@ -315,7 +315,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3t.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3t.sh index 034f2bafd70..ba3e714a158 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3t.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3t.sh @@ -311,7 +311,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3u.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3u.sh index 97c44ad55fc..a767afefd5e 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3u.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3u.sh @@ -305,7 +305,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3v.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3v.sh index 381a9e8686f..57957ab4585 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3v.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3v.sh @@ -303,7 +303,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3w.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3w.sh index 9f13b10753d..d35dfd8a18a 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3w.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3w.sh @@ -307,7 +307,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3x.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3x.sh index 25db1450265..71399ca4fb7 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3x.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3x.sh @@ -316,7 +316,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3y.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3y.sh index 3376652f3c2..b328a38f564 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3y.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3y.sh @@ -321,7 +321,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3z.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3z.sh index 25a68263dc7..50148dc8378 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_3z.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_3z.sh @@ -325,7 +325,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4a.sh index 0be490863dc..908ccb0c6e4 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4a.sh @@ -324,7 +324,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4b.sh index 40ede7c5982..6955905caa7 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4b.sh @@ -321,7 +321,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4c.sh index be9043c0527..badda8a057b 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4c.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4c.sh @@ -332,7 +332,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4d.sh index 7f58fbebbfc..9ac4f3aeac3 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4d.sh @@ -321,7 +321,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4e.sh index 8625cfa52c8..84d890912bd 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4e.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4e.sh @@ -337,7 +337,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4f.sh index 7ba4e8c6cb7..43242092b69 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4f.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4f.sh @@ -341,7 +341,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4g.sh index f1059f0091f..dbce2b1cb1c 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4g.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4g.sh @@ -340,7 +340,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4n.sh index 62154dd5d71..4b57c4d072d 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4n.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4n.sh @@ -361,7 +361,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4p.sh index 0120c2c507d..224708375fc 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4p.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4p.sh @@ -356,7 +356,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4q.sh index 7d920092c30..57d13be22ce 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4q.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4q.sh @@ -148,7 +148,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4r.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4r.sh index 591b79352ab..9722c7c6704 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4r.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4r.sh @@ -355,7 +355,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4s.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4s.sh index fea6a776dbf..3c020375985 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4s.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4s.sh @@ -355,7 +355,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4t.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4t.sh index 0173b586700..d8ebc5b150f 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4t.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4t.sh @@ -357,7 +357,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4u.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4u.sh index ac15f232500..86811c0c886 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4u.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4u.sh @@ -359,7 +359,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4v.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4v.sh index 0682615acf3..9fb9d849e75 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4v.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4v.sh @@ -369,7 +369,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4w.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4w.sh index 77d5013d91f..5baa90022ed 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4w.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4w.sh @@ -372,7 +372,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4x.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4x.sh index 9c59137bbfc..c6520ee0f1b 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_4x.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_4x.sh @@ -371,7 +371,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5a.sh index 1d44637a8c8..e5a605af399 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5a.sh @@ -376,7 +376,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5b.sh index cdb769fb959..79ee6ce0aa4 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5b.sh @@ -379,7 +379,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5c.sh index 17d8c41a82e..7f916c32993 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5c.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5c.sh @@ -384,7 +384,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5d.sh index f3b92944f1a..c898f769f5f 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5d.sh @@ -382,7 +382,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5e.sh index 5a64c967907..4db6f59a868 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5e.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5e.sh @@ -392,7 +392,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5f.sh index c40f2ada0d3..4e8b1632a56 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5f.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5f.sh @@ -398,7 +398,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5g.sh index 5f59e146f65..18a9669daa7 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5g.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5g.sh @@ -452,7 +452,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5h.sh index f8dc8886eb5..b4976c1c83a 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5h.sh @@ -409,7 +409,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5i.sh index 7b7f67125c3..af92e3d1a94 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5i.sh @@ -407,7 +407,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5j.sh index bf1787c4373..f88e5885bb3 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5j.sh @@ -403,7 +403,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5k.sh index 93f9bffdd12..fec6d8d2a36 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5k.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5k.sh @@ -430,7 +430,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5l.sh index f0c66c3a7cd..5b7580e8022 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5l.sh @@ -440,7 +440,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5m.sh index dc0f19e9261..aac8f038360 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5m.sh @@ -405,7 +405,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5n.sh index 51a3f6e7723..efe8193cc1b 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5n.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5n.sh @@ -435,7 +435,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5o.sh index 4e2e6033d29..a8497ce24c9 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5o.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5o.sh @@ -443,7 +443,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5p.sh index 36056efce7a..47acc829fe8 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5p.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5p.sh @@ -396,7 +396,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5q.sh index 01a9e867b57..691f575ab34 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5q.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5q.sh @@ -400,7 +400,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5r.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5r.sh index a20ca2da3de..f48f8500710 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5r.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5r.sh @@ -402,7 +402,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5s.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5s.sh index df981a478c0..00f00103be7 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5s.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5s.sh @@ -416,7 +416,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5t.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5t.sh index ddd08de7707..8a69ec95d9b 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5t.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5t.sh @@ -420,7 +420,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5u.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5u.sh index 28333fd912e..998300ca1a2 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5u.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5u.sh @@ -481,7 +481,7 @@ if [ $stage -le 16 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5v.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5v.sh index 2cdb0bb988c..507d2257adc 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5v.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5v.sh @@ -434,7 +434,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5w.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5w.sh index 5a33622645a..3d263c066d6 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5w.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5w.sh @@ -445,7 +445,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5x.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5x.sh index 0b76fe60a7b..608f14634c1 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5x.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5x.sh @@ -452,7 +452,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5y.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5y.sh index 3fd623e163f..eb5581f6144 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5y.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5y.sh @@ -451,7 +451,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5z.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5z.sh index ff3528d9660..0b0b8ada738 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_5z.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_5z.sh @@ -443,7 +443,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6a.sh index 194245be1e3..c8b02f54172 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6a.sh @@ -465,7 +465,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6b.sh index d4194a5afe4..8449fb3b76a 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6b.sh @@ -455,7 +455,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6c.sh index 89021098c49..927a33616ea 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6c.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6c.sh @@ -443,7 +443,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6d.sh index 354640e0258..8909f643de0 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6d.sh @@ -445,7 +445,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6e.sh index 80fea19e7a2..a8d82682d24 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6e.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6e.sh @@ -439,7 +439,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6f.sh index f92048cfeb4..a661b5b59fb 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6f.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6f.sh @@ -445,7 +445,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6g.sh index fbc5e0c54b5..c14f2fca4d7 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6g.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6g.sh @@ -466,7 +466,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h.sh index 5449671d131..5a1a7d40474 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h.sh @@ -469,7 +469,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh index 6db0a4f5ac4..4e47c28b9f7 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh @@ -159,7 +159,7 @@ if [ $stage -le 1 ]; then # have some stragglers. steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \ --online-ivector-dir $online_ivector_dir \ - --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \ + --scale-opts "--acoustic-scale=1.0 " \ --nj $nj $train_data_dir $lang $srcdir ${srcdir}_ali${affix} ; fi @@ -173,7 +173,7 @@ if [ -z "$lats_dir" ]; then subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving # total slots = 80 * 6 = 480. steps/nnet3/make_denlats.sh --cmd "$decode_cmd" \ - --self-loop-scale 1.0 --acwt 1.0 --determinize true \ + --acwt 1.0 --determinize true \ --online-ivector-dir $online_ivector_dir \ --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \ $train_data_dir $lang $srcdir ${lats_dir} ; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_py.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_py.sh index 32631f4d348..f763c5f95b3 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_py.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_py.sh @@ -152,7 +152,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6i.sh index 093bceb2717..a00b762695f 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6i.sh @@ -473,7 +473,7 @@ if [ $stage -le 15 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6j.sh index cf98106ea04..416ae831c21 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6j.sh @@ -458,7 +458,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6k.sh index 5d518aeab2a..99b00b50e1a 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6k.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6k.sh @@ -485,7 +485,7 @@ if [ $stage -le 15 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6l.sh index c76f5a9efd3..6e30c9a62e1 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6l.sh @@ -497,7 +497,7 @@ if [ $stage -le 15 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6m.sh index 39d6d3cb449..d352a63c944 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6m.sh @@ -473,7 +473,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6n.sh index 0911711e73c..04adf6805d5 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6n.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6n.sh @@ -475,7 +475,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6o.sh index c07cb35ed33..959067206fe 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6o.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6o.sh @@ -484,7 +484,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6p.sh index 5710dbe2ef9..38065eba75d 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6p.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6p.sh @@ -479,7 +479,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6q.sh index 3e93d79b799..b2ab4a581d5 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6q.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6q.sh @@ -469,7 +469,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6r.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6r.sh index 0415f4e0fb9..dda85a77550 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6r.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6r.sh @@ -468,7 +468,7 @@ if [ $stage -le 15 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6s.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6s.sh index 0564c0a858f..4de17d1c875 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6s.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6s.sh @@ -478,7 +478,7 @@ if [ $stage -le 15 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6t.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6t.sh index 98ecd477a1d..787ff971b9d 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6t.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6t.sh @@ -488,7 +488,7 @@ if [ $stage -le 15 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6u.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6u.sh index 9e8afc3c5b8..67da19429a2 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6u.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6u.sh @@ -499,7 +499,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6v.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6v.sh index 732b60d7c95..9c0de1c8597 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6v.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6v.sh @@ -191,7 +191,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6w.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6w.sh index a625859f7d4..e0db69bf5a5 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6w.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6w.sh @@ -199,7 +199,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6x.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6x.sh index 2e79e24ddb6..90a6793254c 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6x.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6x.sh @@ -194,7 +194,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6y.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6y.sh index 5cf1cead63f..fd5a9342c40 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6y.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6y.sh @@ -192,7 +192,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6z.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6z.sh index baa42a087b7..a03cf67b4f7 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6z.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6z.sh @@ -196,7 +196,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7a.sh index 5dd430ded8d..1ce8dcef65f 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7a.sh @@ -227,7 +227,7 @@ if [ $stage -le 16 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7b.sh index 47dbe843d8e..c3bc49b783d 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7b.sh @@ -211,7 +211,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7c.sh index 3335ef788a4..8006dc6051e 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7c.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7c.sh @@ -211,7 +211,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7d.sh index dba1b99582a..a83027b2a05 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7d.sh @@ -181,7 +181,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7e.sh index 704411b6a76..a3dcb68c92d 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7e.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7e.sh @@ -182,7 +182,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh index a7a5a11dc7a..c07b5f3b936 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh @@ -183,7 +183,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh index 0623d26a9e4..9a0cfe0b301 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh @@ -200,7 +200,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh index dbbe3c1e6fd..13323fa559f 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh @@ -200,7 +200,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh index 2a8a658bf6b..483b08c6938 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh @@ -193,7 +193,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh index a9eba36ddaa..a9e91f78dd1 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh @@ -193,7 +193,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7k.sh index 8e0b290cf87..9843df8945b 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7k.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7k.sh @@ -194,7 +194,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh index bb9ddf209d6..09dd37fe61d 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh @@ -188,7 +188,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh index 97f92c14f1f..ef99c6d42f7 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh @@ -204,7 +204,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m25l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m25l.sh index d9fe106e5d7..fdbbdae43b6 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m25l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m25l.sh @@ -541,7 +541,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh index 99e43443f99..ee3b0a70974 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh @@ -207,7 +207,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh index 44ca3b3d279..26448a96ffc 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh @@ -231,7 +231,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7p.sh index d19a4ef4c0b..d68475c6fad 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7p.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7p.sh @@ -219,7 +219,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh index cea0891d5d7..6fbdd992d02 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh @@ -207,7 +207,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_a.sh index 96046ac23c1..a18133a5609 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_a.sh @@ -122,8 +122,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_a2.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_a2.sh index 3a8e41a8315..94921a094bd 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_a2.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_a2.sh @@ -121,8 +121,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_attention_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_attention_1a.sh index d4febd61e94..df1b86fee25 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_attention_1a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_attention_1a.sh @@ -202,7 +202,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_b.sh index 8c623a7c01b..e02d283a171 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_b.sh @@ -124,8 +124,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1a.sh index 4414147bf0e..460a9081d28 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1a.sh @@ -210,7 +210,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh index cd9d4dc6f2b..e60697f1300 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh @@ -213,7 +213,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1c.sh index 18b660b4080..53485466f13 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1c.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1c.sh @@ -211,7 +211,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1d.sh index be615e0e361..f9474506d86 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1d.sh @@ -205,7 +205,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_c.sh index ec4634acf69..cbc6254575c 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_c.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_c.sh @@ -132,8 +132,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_d.sh index 3a66a8cd556..94241dfc833 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_d.sh @@ -136,8 +136,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_e.sh index d30a513181e..f19df0c7190 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_e.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_e.sh @@ -142,8 +142,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_f.sh index 12450c2ae62..047cf298021 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_f.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_f.sh @@ -147,8 +147,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_g.sh index 70845684262..340d8fd7b61 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_g.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_g.sh @@ -149,8 +149,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_h.sh index 01f8743f585..91999d8459e 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_h.sh @@ -163,8 +163,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_i.sh index 82d91bbd33e..0b17e772a95 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_i.sh @@ -157,8 +157,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_j.sh index 334eec7e872..8fa842cb5da 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_j.sh @@ -164,8 +164,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_k.sh index b64318ec4bb..0ee01ac3de4 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_k.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_k.sh @@ -159,8 +159,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_l.sh index 6de6c79affc..9fbfcf68b01 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_l.sh @@ -163,8 +163,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh index 43855e6f7ce..15e194d9081 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -205,7 +205,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh index 5c82ed0eb11..afcb48ff04a 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -201,7 +201,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh index c3df0bf2b2c..c6e8625ff0b 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh @@ -208,7 +208,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh index 3d353387239..ab301178c87 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -231,7 +231,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh index 2a2d508ecdd..3d221ad03f2 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -226,7 +226,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh index 5af5463b372..9affa2aeaf2 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -243,7 +243,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh index 28105a587ec..1921c9735e5 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -225,7 +225,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh index d6e81f2d8eb..579ccb45140 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh @@ -222,7 +222,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh index 060d98c9d05..4e8dddab481 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -243,7 +243,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1j.sh index 9bd39a262c5..5f7818689d4 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1j.sh @@ -210,7 +210,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh index ccd6138da6e..4383dacb5ff 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh @@ -220,7 +220,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh index f702033377a..3b070d0a35b 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -208,7 +208,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh index b43577bd76c..375b00a70bd 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -220,7 +220,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh index 5bb6e7da152..eb7cf854982 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh @@ -217,7 +217,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_m.sh index 8d357db0217..5290de020eb 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_m.sh @@ -164,8 +164,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_n.sh index a190a1d56dd..cdc7b8749fa 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_n.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_n.sh @@ -174,8 +174,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_o.sh index 5b80665268d..1e4ffbaf665 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_o.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_o.sh @@ -178,8 +178,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh index 4db38d74508..7dee86b1741 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -225,7 +225,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh index 7e9dec67068..0a83ab53e6c 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_opgru_1b.sh @@ -223,7 +223,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_p.sh index d401790449d..cc2dfe960bb 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_p.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_p.sh @@ -171,8 +171,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_q.sh index c6758a62fa5..1204cdb0eed 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_q.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_q.sh @@ -181,8 +181,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_r.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_r.sh index 73cadcc622c..95bef5b5317 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_r.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_r.sh @@ -181,8 +181,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_s.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_s.sh index ae10b53824f..f1ffc479eb2 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_s.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_s.sh @@ -183,8 +183,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_t.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_t.sh index dabb2a6db87..7b7698a730f 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_t.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_t.sh @@ -186,8 +186,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_u.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_u.sh index c83274499fa..d2f69394557 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_u.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_u.sh @@ -191,8 +191,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_v.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_v.sh index 38f31269d33..fe195ab33ee 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_v.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_v.sh @@ -197,8 +197,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_w.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_w.sh index 35d1ddd8052..cea2fd8d4c4 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_w.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_w.sh @@ -190,8 +190,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_x.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_x.sh index 0f294033489..6cb8a63b478 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_x.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_x.sh @@ -191,8 +191,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_y.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_y.sh index 09217d1b196..4ce697ad42f 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_y.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_y.sh @@ -201,8 +201,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_z.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_z.sh index 0c8524a2c90..04ed8cfb022 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_z.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_z.sh @@ -191,8 +191,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --transition-scale 0.0 \ - --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh \ + data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh index 02e637286b5..ed1927c648e 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh @@ -138,7 +138,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 0.333 data/lang_sw1_tg $dir $graph_dir + utils/mkgraph.sh data/lang_sw1_tg $dir $graph_dir fi if [ $stage -le 14 ]; then diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh index 67fd3c03d27..59b63a6fcee 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh @@ -140,7 +140,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 0.333 data/lang_sw1_tg $dir $graph_dir + utils/mkgraph.sh data/lang_sw1_tg $dir $graph_dir fi if [ $stage -le 14 ]; then diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh index 260116666a0..e6cc4e8bed1 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh @@ -137,7 +137,7 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 0.333 data/lang_sw1_tg $dir $graph_dir + utils/mkgraph.sh data/lang_sw1_tg $dir $graph_dir fi if [ $stage -le 14 ]; then diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c_disc.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c_disc.sh index e1d0f06affe..2c8c6a57669 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c_disc.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c_disc.sh @@ -130,7 +130,7 @@ if [ $stage -le 1 ]; then # hardcode no-GPU for alignment, although you could use GPU [you wouldn't # get excellent GPU utilization though.] steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \ - --scale-opts '--transition-scale=1.0 --acoustic-scale=0.333 --self-loop-scale=0.333' \ + --scale-opts '--acoustic-scale=0.333' \ --frames-per-chunk $frames_per_chunk_decoding \ --extra-left-context $extra_left_context --extra-right-context $extra_right_context \ --extra-left-context-initial 0 --extra-right-context-final 0 \ @@ -150,7 +150,7 @@ if [ -z "$degs_dir" ]; then steps/nnet3/get_degs.sh \ --cmd "$decode_cmd --mem 10G" --num-threads 3 \ - --self-loop-scale 0.333 --acwt 0.333 \ + --acwt 0.333 \ --max-copy-jobs $max_copy_jobs \ --extra-left-context $extra_left_context \ --extra-right-context $extra_right_context \ diff --git a/egs/tedlium/s5/local/chain/run_tdnn.sh b/egs/tedlium/s5/local/chain/run_tdnn.sh index 545294dd035..96fee897a56 100755 --- a/egs/tedlium/s5/local/chain/run_tdnn.sh +++ b/egs/tedlium/s5/local/chain/run_tdnn.sh @@ -173,7 +173,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph + utils/mkgraph.sh data/lang_test $dir $dir/graph fi graph_dir=$dir/graph diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_blstm_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_blstm_1a.sh index 2ac8c09dad1..a53e2016f8b 100644 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_blstm_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_blstm_1a.sh @@ -222,7 +222,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh index 47557f93696..8ebf33af18f 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh @@ -232,7 +232,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh index 7afa1b7f902..6414898fca7 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh @@ -233,7 +233,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh index e69e499e152..8ce19734a22 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh @@ -231,7 +231,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh index 86e0352828c..cc51f149446 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh @@ -244,7 +244,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh index 0fdb2b3b63e..b39f94865f3 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh @@ -234,7 +234,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh index 70e72ee1914..36921a1ea9f 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh @@ -177,7 +177,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh index 492d3efb804..8839ecf14a7 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh @@ -223,7 +223,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh index 01768c3875f..6eaf886ef5b 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh @@ -240,7 +240,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh index bb5007f4c9f..1ee826d0e5c 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh @@ -231,7 +231,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh index 1476ed1fd40..f3b0e654813 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh @@ -223,7 +223,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1f.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1f.sh index 47f939fea1c..368b10e4ca7 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1f.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1f.sh @@ -222,7 +222,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1g.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1g.sh index f02025674e8..7c41c7e2a5d 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1g.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1g.sh @@ -226,7 +226,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh index b03da27e760..a13e767e767 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -242,7 +242,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh index e896a7867b3..936129704bb 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -257,7 +257,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh index 00f72fab796..15745ba14a0 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh @@ -249,7 +249,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh index 80a9ed1c4d0..d827d2d789b 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -253,7 +253,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh index 031978f878a..563e8071df1 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -301,7 +301,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e_disc.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e_disc.sh index 0d64c75aea8..dd26c9f0bf3 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e_disc.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e_disc.sh @@ -135,7 +135,7 @@ if [ $stage -le 1 ]; then # hardcode no-GPU for alignment, although you could use GPU [you wouldn't # get excellent GPU utilization though.] 
steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \ - --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \ + --scale-opts "--acoustic-scale=1.0 " \ --frames-per-chunk $frames_per_chunk_egs \ --extra-left-context $extra_left_context --extra-right-context $extra_right_context \ --extra-left-context-initial 0 --extra-right-context-final 0 \ @@ -156,7 +156,7 @@ if [ -z "$degs_dir" ]; then steps/nnet3/get_degs.sh \ --cmd "$decode_cmd --mem 10G" --num-threads 3 \ --max-copy-jobs $max_copy_jobs \ - --self-loop-scale 1.0 --acwt 1.0 \ + --acwt 1.0 \ --extra-left-context $extra_left_context \ --extra-right-context $extra_right_context \ --extra-left-context-initial 0 --extra-right-context-final 0 \ diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh index c60b8f7fefc..d3d6ca36816 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -255,7 +255,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh index 2d2048a6869..638cbec690a 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -258,7 +258,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh index a074e128270..dcf463156a6 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh @@ -256,7 +256,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh index 3bfe175806f..0a4c678c606 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -277,7 +277,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh index acbef783823..c1d9c4dad71 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh @@ -274,7 +274,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh index 173be863608..212967ca356 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh @@ -279,7 +279,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh index 94955d0472c..dbb2a8acbe1 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -270,7 +270,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh index efd3bc98725..189f8a2cb79 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -270,7 +270,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh index c0559e8d389..27dd0331dd2 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh @@ -280,7 +280,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh index 5a6dbaef8af..89de6799c27 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh @@ -284,7 +284,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh index dd38d56759f..5209710dbc4 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh @@ -279,7 +279,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh index 1378d2d176d..bb083315157 100644 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh @@ -240,7 +240,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh index 3c4882ec2c6..82eca8f19af 100644 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh @@ -240,7 +240,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh index 23ea14ae151..df96469ad28 100644 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh @@ -234,7 +234,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1v.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1v.sh index 7c44d963504..a10913133ad 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1v.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1v.sh @@ -239,7 +239,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_1a.sh index 042ef346578..6c95f819dd2 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_1a.sh @@ -248,7 +248,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1a.sh index 905e1845183..fc3e45dc7b0 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1a.sh @@ -254,7 +254,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1b.sh index 7bd96e7d82c..273068b6b28 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1b.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_attention_bs_1b.sh @@ -241,7 +241,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh index f0220b17376..06b4b2451a4 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh @@ -172,7 +172,7 @@ if [ $stage -le 16 ]; then # as long as phones.txt was compatible. 
utils/lang/check_phones_compatible.sh data/lang/phones.txt $lang/phones.txt - utils/mkgraph.sh --self-loop-scale 0.333 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh index 3e8509bf4ac..d867c99a185 100755 --- a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh @@ -222,7 +222,7 @@ if [ $stage -le 16 ]; then # as long as phones.txt was compatible. utils/lang/check_phones_compatible.sh data/lang/phones.txt $lang/phones.txt - utils/mkgraph.sh --self-loop-scale 0.333 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 17 ]; then diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh index 1204ff6ce4c..0289ff9dd16 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -224,7 +224,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh index 744c964db2f..a5250f4ff9d 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh @@ -231,7 +231,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1c.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1c.sh index faac365af54..1c6c3709bdf 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1c.sh @@ -224,7 +224,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tunisian_msa/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/tunisian_msa/s5/local/chain/tuning/run_tdnn_1a.sh index ab68ba6fb68..166b6d842a0 100755 --- a/egs/tunisian_msa/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tunisian_msa/s5/local/chain/tuning/run_tdnn_1a.sh @@ -223,7 +223,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). 
utils/mkgraph.sh \ - --self-loop-scale 1.0 \ + \ data/lang_test \ $tree_dir \ $tree_dir/graph || exit 1; diff --git a/egs/uw3/v1/local/chain/run_cnn_1a.sh b/egs/uw3/v1/local/chain/run_cnn_1a.sh index e3548609da7..401d79e7217 100755 --- a/egs/uw3/v1/local/chain/run_cnn_1a.sh +++ b/egs/uw3/v1/local/chain/run_cnn_1a.sh @@ -216,7 +216,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_test \ + $lang_test \ $dir $dir/graph || exit 1; fi diff --git a/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh index 844ccf80677..d68ac82ce6c 100755 --- a/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -234,7 +234,7 @@ if [ $stage -le 15 ]; then # Note: it's not important to give mkgraph.sh the lang directory with the # matched topology (since it gets the topology file from the model). utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_sp_test \ + data/lang_sp_test \ $tree_dir $tree_dir/graph || exit 1; fi diff --git a/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh b/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh index 1ddb3c305ac..0d07afa4519 100755 --- a/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh +++ b/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh @@ -175,13 +175,13 @@ if [ $stage -le 4 ]; then utils/lang/check_phones_compatible.sh \ data/lang_nosp_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_nosp_test_tgpr \ + data/lang_nosp_test_tgpr \ $dir $treedir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_nosp_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_nosp_test_bd_tgpr \ + data/lang_nosp_test_bd_tgpr \ $dir $treedir/graph_bd_tgpr || exit 1; fi @@ -203,7 +203,7 @@ if [ $stage -le 5 ]; then $treedir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_nosp_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh b/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh index be82e80d5fe..714ace8a633 100755 --- a/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh +++ b/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh @@ -189,13 +189,13 @@ if [ $stage -le 4 ]; then utils/lang/check_phones_compatible.sh \ data/lang_char_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_char_test_tgpr \ + data/lang_char_test_tgpr \ $dir $treedir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_char_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_char_test_bd_tgpr \ + data/lang_char_test_bd_tgpr \ $dir $treedir/graph_bd_tgpr || exit 1; fi @@ -219,7 +219,7 @@ if [ $stage -le 5 ]; then $treedir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_char_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1a.sh 
b/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1a.sh index 4ab0cf58d53..70f22f39903 100755 --- a/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1a.sh +++ b/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1a.sh @@ -181,13 +181,13 @@ if [ $stage -le 4 ]; then utils/lang/check_phones_compatible.sh \ data/lang_char_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_char_test_tgpr \ + data/lang_char_test_tgpr \ $dir $treedir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_char_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_char_test_bd_tgpr \ + data/lang_char_test_bd_tgpr \ $dir $treedir/graph_bd_tgpr || exit 1; fi @@ -209,7 +209,7 @@ if [ $stage -le 5 ]; then $treedir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_char_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1b.sh b/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1b.sh index 4e66fae8baa..615c2535f7d 100755 --- a/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1b.sh +++ b/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1b.sh @@ -183,13 +183,13 @@ if [ $stage -le 4 ]; then utils/lang/check_phones_compatible.sh \ data/lang_char_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_char_test_tgpr \ + data/lang_char_test_tgpr \ $dir $treedir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_char_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_char_test_bd_tgpr \ + data/lang_char_test_bd_tgpr \ $dir $treedir/graph_bd_tgpr || exit 1; fi @@ -211,7 +211,7 @@ if [ $stage -le 5 ]; then $treedir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_char_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1a.sh index e656b67e529..5a8a20496cd 100755 --- a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1a.sh +++ b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -269,13 +269,13 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgpr \ + data/lang_test_tgpr \ $tree_dir $tree_dir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + data/lang_test_bd_tgpr \ $tree_dir $tree_dir/graph_bd_tgpr || exit 1; fi @@ -300,7 +300,7 @@ if [ $stage -le 18 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -338,7 +338,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then 
$tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1b.sh b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1b.sh index 9db76e94430..e5265d3b31d 100755 --- a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1b.sh +++ b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1b.sh @@ -272,13 +272,13 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgpr \ + data/lang_test_tgpr \ $tree_dir $tree_dir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + data/lang_test_bd_tgpr \ $tree_dir $tree_dir/graph_bd_tgpr || exit 1; fi @@ -303,7 +303,7 @@ if [ $stage -le 18 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -341,7 +341,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1c.sh b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1c.sh index 36ec5bb61af..e1394fb65da 100755 --- a/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1c.sh +++ b/egs/wsj/s5/local/chain/tuning/run_cnn_tdnn_1c.sh @@ -260,13 +260,13 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgpr \ + data/lang_test_tgpr \ $tree_dir $tree_dir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + data/lang_test_bd_tgpr \ $tree_dir $tree_dir/graph_bd_tgpr || exit 1; fi @@ -287,7 +287,7 @@ if [ $stage -le 18 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -325,7 +325,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh index 8d44db6f917..a3950d71d1f 100755 --- 
a/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1a.sh @@ -276,13 +276,13 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgpr \ + data/lang_test_tgpr \ $tree_dir $tree_dir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + data/lang_test_bd_tgpr \ $tree_dir $tree_dir/graph_bd_tgpr || exit 1; fi @@ -307,7 +307,7 @@ if [ $stage -le 18 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -345,7 +345,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1b.sh index 544b9b04a0a..26a88900b0f 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1b.sh @@ -252,13 +252,13 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgpr \ + data/lang_test_tgpr \ $tree_dir $tree_dir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + data/lang_test_bd_tgpr \ $tree_dir $tree_dir/graph_bd_tgpr || exit 1; fi @@ -283,7 +283,7 @@ if [ $stage -le 18 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -321,7 +321,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1c.sh index b268ed7feda..654fc25a49a 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1c.sh @@ -253,13 +253,13 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgpr \ + data/lang_test_tgpr \ $tree_dir $tree_dir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + 
data/lang_test_bd_tgpr \ $tree_dir $tree_dir/graph_bd_tgpr || exit 1; fi @@ -283,7 +283,7 @@ if [ $stage -le 18 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -321,7 +321,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1d.sh index d1a7f9d0663..a8549470006 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1d.sh @@ -255,13 +255,13 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgpr \ + data/lang_test_tgpr \ $tree_dir $tree_dir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + data/lang_test_bd_tgpr \ $tree_dir $tree_dir/graph_bd_tgpr || exit 1; fi @@ -285,7 +285,7 @@ if [ $stage -le 18 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -323,7 +323,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1e.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1e.sh index e20069fbfa1..585399e1367 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1e.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1e.sh @@ -264,13 +264,13 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgpr \ + data/lang_test_tgpr \ $tree_dir $tree_dir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + data/lang_test_bd_tgpr \ $tree_dir $tree_dir/graph_bd_tgpr || exit 1; fi @@ -294,7 +294,7 @@ if [ $stage -le 18 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -332,7 +332,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then $tree_dir/graph_${lmtype} data/${data} 
${dir}_online/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh index 86df0779841..ad97730bf01 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh @@ -258,13 +258,13 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgpr \ + data/lang_test_tgpr \ $tree_dir $tree_dir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + data/lang_test_bd_tgpr \ $tree_dir $tree_dir/graph_bd_tgpr || exit 1; fi @@ -288,7 +288,7 @@ if [ $stage -le 18 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -326,7 +326,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh index 9927a0c28d3..9ba6dfb912d 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh @@ -266,13 +266,13 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgpr \ + data/lang_test_tgpr \ $tree_dir $tree_dir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + data/lang_test_bd_tgpr \ $tree_dir $tree_dir/graph_bd_tgpr || exit 1; fi @@ -296,7 +296,7 @@ if [ $stage -le 18 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -334,7 +334,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1a.sh index 6e4f220c1f2..58c31f67ff7 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1a.sh +++ 
b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -276,13 +276,13 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgpr \ + data/lang_test_tgpr \ $tree_dir $tree_dir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + data/lang_test_bd_tgpr \ $tree_dir $tree_dir/graph_bd_tgpr || exit 1; fi @@ -307,7 +307,7 @@ if [ $stage -le 18 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -343,7 +343,7 @@ if [ $stage -le 19 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_looped_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_looped_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -377,7 +377,7 @@ if $test_online_decoding && [ $stage -le 20 ]; then $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b.sh index 2d113e58a93..41389e4d07a 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -567,13 +567,13 @@ if [ $stage -le 17 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgpr \ + data/lang_test_tgpr \ $tree_dir $tree_dir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_test_bd_tgpr/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + data/lang_test_bd_tgpr \ $tree_dir $tree_dir/graph_bd_tgpr || exit 1; fi @@ -598,7 +598,7 @@ if [ $stage -le 18 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -634,7 +634,7 @@ if [ $stage -le 19 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_looped_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_looped_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -668,7 +668,7 @@ if $test_online_decoding && [ $stage -le 20 ]; then $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 1.0 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1 
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh b/egs/wsj/s5/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh index f2a4ed37ae5..03554c61b57 100755 --- a/egs/wsj/s5/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh +++ b/egs/wsj/s5/local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh @@ -218,12 +218,12 @@ if [ $stage -le 16 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgpr/phones.txt $lang/phones.txt - utils/mkgraph.sh --self-loop-scale 0.333 data/lang_test_tgpr \ + utils/mkgraph.sh data/lang_test_tgpr \ $dir $dir/graph_tgpr || exit 1; utils/lang/check_phones_compatible.sh \ data/lang_test_bd_tgpr/phones.txt $lang/phones.txt - utils/mkgraph.sh --self-loop-scale 0.333 data/lang_test_bd_tgpr \ + utils/mkgraph.sh data/lang_test_bd_tgpr \ $dir $dir/graph_bd_tgpr || exit 1; fi @@ -248,7 +248,7 @@ if [ $stage -le 17 ]; then $dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 0.333 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -284,7 +284,7 @@ if [ $stage -le 18 ]; then $dir/graph_${lmtype} data/${data}_hires ${dir}/decode_looped_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 0.333 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}/decode_looped_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -318,7 +318,7 @@ if $test_online_decoding && [ $stage -le 19 ]; then $dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1 done steps/lmrescore.sh \ - --self-loop-scale 0.333 \ + \ --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \ data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1 steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ diff --git a/egs/wsj/s5/steps/align_basis_fmllr.sh b/egs/wsj/s5/steps/align_basis_fmllr.sh index e5510c5ab7e..a9ceab29b4f 100755 --- a/egs/wsj/s5/steps/align_basis_fmllr.sh +++ b/egs/wsj/s5/steps/align_basis_fmllr.sh @@ -19,7 +19,7 @@ nj=4 cmd=run.pl use_graphs=false # Begin configuration. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" basis_fmllr_opts="--fmllr-min-count=22 --num-iters=10 --size-scale=0.2 --step-size-iters=3" beam=10 retry_beam=40 diff --git a/egs/wsj/s5/steps/align_basis_fmllr_lats.sh b/egs/wsj/s5/steps/align_basis_fmllr_lats.sh index 426168496cc..dd3db90bd76 100755 --- a/egs/wsj/s5/steps/align_basis_fmllr_lats.sh +++ b/egs/wsj/s5/steps/align_basis_fmllr_lats.sh @@ -16,7 +16,6 @@ stage=0 nj=4 cmd=run.pl # Begin configuration. 
-scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" acoustic_scale=0.1 beam=10 retry_beam=40 @@ -112,18 +111,18 @@ if [ $stage -le 0 ]; then echo "$0: compiling training graphs" tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ - compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $scale_opts $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ + compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; fi if [ $stage -le 1 ]; then - # Note: we need to set --transition-scale=0.0 --self-loop-scale=0.0 because, + # Note: we need to set because, # as explained above, we compiled the transition probs into the training # graphs. echo "$0: aligning data in $data using $alimdl and speaker-independent features." $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ - gmm-align-compiled --transition-scale=0.0 --self-loop-scale=0.0 --acoustic-scale=$acoustic_scale \ + gmm-align-compiled --acoustic-scale=$acoustic_scale \ --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \ "ark:gunzip -c $dir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; fi diff --git a/egs/wsj/s5/steps/align_fmllr.sh b/egs/wsj/s5/steps/align_fmllr.sh index 327978e680f..c1ec67ec7dc 100755 --- a/egs/wsj/s5/steps/align_fmllr.sh +++ b/egs/wsj/s5/steps/align_fmllr.sh @@ -18,7 +18,7 @@ nj=4 cmd=run.pl use_graphs=false # Begin configuration. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 careful=false diff --git a/egs/wsj/s5/steps/align_fmllr_lats.sh b/egs/wsj/s5/steps/align_fmllr_lats.sh index b331b40d73c..e561a6f0d29 100755 --- a/egs/wsj/s5/steps/align_fmllr_lats.sh +++ b/egs/wsj/s5/steps/align_fmllr_lats.sh @@ -12,7 +12,6 @@ stage=0 nj=4 cmd=run.pl # Begin configuration. -scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" acoustic_scale=0.1 beam=10 retry_beam=40 @@ -100,18 +99,18 @@ if [ $stage -le 0 ]; then echo "$0: compiling training graphs" tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ - compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $scale_opts $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ + compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; fi if [ $stage -le 1 ]; then - # Note: we need to set --transition-scale=0.0 --self-loop-scale=0.0 because, + # Note: we need to set because, # as explained above, we compiled the transition probs into the training # graphs. echo "$0: aligning data in $data using $alimdl and speaker-independent features." $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ - gmm-align-compiled --transition-scale=0.0 --self-loop-scale=0.0 --acoustic-scale=$acoustic_scale \ + gmm-align-compiled --acoustic-scale=$acoustic_scale \ --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \ "ark:gunzip -c $dir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; fi diff --git a/egs/wsj/s5/steps/align_lvtln.sh b/egs/wsj/s5/steps/align_lvtln.sh index 9efba2b9096..671c3e45c71 100755 --- a/egs/wsj/s5/steps/align_lvtln.sh +++ b/egs/wsj/s5/steps/align_lvtln.sh @@ -13,7 +13,7 @@ nj=4 cmd=run.pl use_graphs=false # Begin configuration. 
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10.0 retry_beam=40 boost_silence=1.0 # factor by which to boost silence during alignment. diff --git a/egs/wsj/s5/steps/align_raw_fmllr.sh b/egs/wsj/s5/steps/align_raw_fmllr.sh index 639dde559a4..5cec25c096a 100755 --- a/egs/wsj/s5/steps/align_raw_fmllr.sh +++ b/egs/wsj/s5/steps/align_raw_fmllr.sh @@ -18,7 +18,7 @@ nj=4 cmd=run.pl use_graphs=false # Begin configuration. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 boost_silence=1.0 # factor by which to boost silence during alignment. diff --git a/egs/wsj/s5/steps/align_sgmm2.sh b/egs/wsj/s5/steps/align_sgmm2.sh index d2f829f7e3e..951e241284e 100755 --- a/egs/wsj/s5/steps/align_sgmm2.sh +++ b/egs/wsj/s5/steps/align_sgmm2.sh @@ -18,7 +18,7 @@ use_gselect=false # use gselect info from srcdir [regardless, we use # Gaussian-selection info, we might have to compute it though.] gselect=15 # Number of Gaussian-selection indices for SGMMs. # Begin configuration. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 transform_dir= # directory to find fMLLR transforms in. diff --git a/egs/wsj/s5/steps/align_si.sh b/egs/wsj/s5/steps/align_si.sh index 0bfebe6b0fc..749124dfadf 100755 --- a/egs/wsj/s5/steps/align_si.sh +++ b/egs/wsj/s5/steps/align_si.sh @@ -15,7 +15,7 @@ nj=4 cmd=run.pl use_graphs=false # Begin configuration. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 careful=false diff --git a/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh b/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh index cc8da298d2f..67d92e0b73a 100755 --- a/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh +++ b/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh @@ -111,10 +111,6 @@ cp $srcdir/frame_subsampling_factor $dir 2>/dev/null || true if [ -f $srcdir/frame_subsampling_factor ]; then echo "$0: guessing that this is a chain system, checking parameters." - if [ -z $scale_opts ]; then - echo "$0: setting scale_opts" - scale_opts="--self-loop-scale=1.0 --transition-scale=1.0" - fi if [ $acwt == 0.1 ]; then echo "$0: setting acwt=1.0" acwt=1.0 diff --git a/egs/wsj/s5/steps/cleanup/find_bad_utts.sh b/egs/wsj/s5/steps/cleanup/find_bad_utts.sh index 9bb67abeff9..27136be6fb5 100755 --- a/egs/wsj/s5/steps/cleanup/find_bad_utts.sh +++ b/egs/wsj/s5/steps/cleanup/find_bad_utts.sh @@ -12,7 +12,7 @@ nj=4 cmd=run.pl use_graphs=false # Begin configuration. -scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +scale_opts="" acoustic_scale=0.1 beam=15.0 lattice_beam=8.0 @@ -197,4 +197,3 @@ if [ $stage -le 3 ]; then sort -i -b -k1,1 -k4,4nr -k2,2 -k3,3 > $dir/analysis/ops_details.txt fi - diff --git a/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh b/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh index b18efe35a3c..68b8497f4e2 100755 --- a/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh +++ b/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh @@ -12,7 +12,7 @@ nj=8 cmd=run.pl use_graphs=false # Begin configuration. 
-scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +scale_opts="" acoustic_scale=0.1 beam=15.0 lattice_beam=8.0 diff --git a/egs/wsj/s5/steps/cleanup/make_biased_lm_graphs.sh b/egs/wsj/s5/steps/cleanup/make_biased_lm_graphs.sh index d957ce4d5c7..9233d142946 100755 --- a/egs/wsj/s5/steps/cleanup/make_biased_lm_graphs.sh +++ b/egs/wsj/s5/steps/cleanup/make_biased_lm_graphs.sh @@ -17,7 +17,7 @@ set -e # Begin configuration section. nj=10 cmd=run.pl -scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +scale_opts="" top_n_words=100 # Number of common words that we compile into each graph (most frequent # in $data/text.orig. top_n_words_weight=1.0 # this weight is before renormalization; it can be more @@ -49,7 +49,7 @@ if [ $# != 4 ]; then echo "Main options (for others, see top of script file)" echo " --scale-opts # Options relating to language" echo " # model scale; default is " - echo " # '--transition-scale=1.0 --self-loop-scale=0.1'" + echo " # ''" echo " --top-n-words # Number of most-common-words to add with" echo " # unigram probabilities into graph (default: 100)" echo " --top-n-words-weight # Weight given to top-n-words portion of graph" diff --git a/egs/wsj/s5/steps/cleanup/make_segmentation_graph.sh b/egs/wsj/s5/steps/cleanup/make_segmentation_graph.sh index 6705ab6db54..ab18d801c2e 100755 --- a/egs/wsj/s5/steps/cleanup/make_segmentation_graph.sh +++ b/egs/wsj/s5/steps/cleanup/make_segmentation_graph.sh @@ -6,8 +6,6 @@ # Begin configuration section. nj=4 cmd=run.pl -tscale=1.0 # transition scale. -loopscale=0.1 # scale for self-loops. cleanup=true ngram_order=1 srilm_options="-wbdiscount" # By default, use Witten-Bell discounting in SRILM @@ -35,8 +33,6 @@ if [ $# -ne 4 ]; then echo "Options:" echo " --ngram-order # order of n-gram language model" echo " --srilm-options # options for ngram-count in SRILM tool" - echo " --tscale # transition scale" - echo " --loopscale # scale for self-loops" echo " --cleanup # if true, removes the intermediate files" exit 1; fi @@ -87,7 +83,7 @@ fi mkdir -p $graph_dir/split$nj mkdir -p $graph_dir/log - + split_texts="" for n in $(seq $nj); do mkdir -p $graph_dir/split$nj/$n @@ -97,7 +93,6 @@ utils/split_scp.pl $data/text.orig $split_texts $cmd JOB=1:$nj $graph_dir/log/make_utterance_graph.JOB.log \ steps/cleanup/make_utterance_graph.sh --cleanup $cleanup \ - --tscale $tscale --loopscale $loopscale \ --ngram-order $ngram_order --srilm-options "$srilm_options" \ $graph_dir/split$nj/JOB/text $lang \ $model_dir $graph_dir/split$nj/JOB || exit 1; diff --git a/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh b/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh index 277c5a2da1c..a784c8777a8 100755 --- a/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh +++ b/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh @@ -4,8 +4,6 @@ # Apache 2.0 # Begin configuration section. -tscale=1.0 # transition scale. -loopscale=0.1 # scale for self-loops. cleanup=true ngram_order=1 srilm_options="-wbdiscount" # By default, use Witten-Bell discounting in SRILM @@ -34,8 +32,6 @@ if [ $# -ne 4 ]; then echo "Options:" echo " --ngram-order # order of n-gram language model" echo " --srilm-options # options for ngram-count in SRILM tool" - echo " --tscale # transition scale" - echo " --loopscale # scale for self-loops" echo " --cleanup # if true, removes the intermediate files" exit 1; fi @@ -134,7 +130,7 @@ cat $text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \ fstisstochastic $wdir/CLG.fst || echo "$0: $uttid/CLG.fst not stochastic." 
make-h-transducer --disambig-syms-out=$wdir/disambig_tid.int \ - --transition-scale=$tscale $wdir/ilabels_${N}_${P} \ + $wdir/ilabels_${N}_${P} \ $model_dir/tree $model_dir/final.mdl > $wdir/Ha.fst # Builds HCLGa.fst @@ -145,13 +141,10 @@ cat $text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \ fstisstochastic $wdir/HCLGa.fst ||\ echo "$0: $uttid/HCLGa.fst is not stochastic" - add-self-loops --self-loop-scale=$loopscale --reorder=true \ - $model_dir/final.mdl < $wdir/HCLGa.fst > $wdir/HCLG.fst + add-self-loops $model_dir/final.mdl < $wdir/HCLGa.fst > $wdir/HCLG.fst - if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then - fstisstochastic $wdir/HCLG.fst ||\ - echo "$0: $uttid/HCLG.fst is not stochastic." - fi + fstisstochastic $wdir/HCLG.fst ||\ + echo "$0: $uttid/HCLG.fst is not stochastic." echo "$uttid $wdir/HCLG.fst" >> $graph_dir/sub_graphs/HCLG.fsts.scp echo diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh index f0df1e7730c..d42cda1b9d3 100755 --- a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh +++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh @@ -171,7 +171,7 @@ if [ -f $srcdir/frame_subsampling_factor ]; then echo "$0: guessing that this is a chain system, checking parameters." if [ -z $scale_opts ]; then echo "$0: setting scale_opts" - scale_opts="--self-loop-scale=1.0 --transition-scale=1.0" + scale_opts=" " fi if [ $acwt == 0.1 ]; then echo "$0: setting acwt=1.0" diff --git a/egs/wsj/s5/steps/decode_basis_fmllr.sh b/egs/wsj/s5/steps/decode_basis_fmllr.sh index afb914e7f0d..7e39048f463 100755 --- a/egs/wsj/s5/steps/decode_basis_fmllr.sh +++ b/egs/wsj/s5/steps/decode_basis_fmllr.sh @@ -37,7 +37,7 @@ acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in # lattice generation. # Parameters in alignment of training data -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" align_beam=10 retry_beam=40 diff --git a/egs/wsj/s5/steps/decode_fromlats.sh b/egs/wsj/s5/steps/decode_fromlats.sh index ee719c0e132..73c8954fb48 100755 --- a/egs/wsj/s5/steps/decode_fromlats.sh +++ b/egs/wsj/s5/steps/decode_fromlats.sh @@ -22,7 +22,7 @@ beam=20.0 lattice_beam=7.0 acwt=0.083333 batch_size=75 # Limits memory blowup in compile-train-graphs-fsts -scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +scale_opts="" skip_scoring=false # End configuration. diff --git a/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh b/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh index 1cdd9885314..a953aeb90e7 100755 --- a/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh +++ b/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh @@ -33,7 +33,7 @@ vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for use_fmllr=false fmllr_iters=10 fmllr_min_count=1000 -scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +scale_opts="" skip_scoring=false # End configuration section. diff --git a/egs/wsj/s5/steps/get_fmllr_basis.sh b/egs/wsj/s5/steps/get_fmllr_basis.sh index 9b60af1fa51..3f145714ef1 100755 --- a/egs/wsj/s5/steps/get_fmllr_basis.sh +++ b/egs/wsj/s5/steps/get_fmllr_basis.sh @@ -8,7 +8,7 @@ stage=0 # Parameters in alignment of training data -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" per_utt=true # If true, then treat each utterance as a separate speaker for purposes of # basis training... 
this is recommended if the number of actual speakers in your # training set is less than (feature-dim) * (feature-dim+1). diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py index 4a39ed9dae6..144bc879e51 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py @@ -76,12 +76,10 @@ def prepare_initial_acoustic_model(dir, alidir, run_opts, common_train_lib.prepare_initial_network(dir, run_opts, srand=srand) - # Convert to .mdl, train the transitions, set the priors. + # Convert to .mdl, set the priors. common_lib.execute_command( """{command} {dir}/log/init_mdl.log \ - nnet3-am-init {alidir}/final.mdl {raw_mdl} - \| \ - nnet3-am-train-transitions - \ - "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl + nnet3-am-init {alidir}/final.mdl {raw_mdl} {dir}/0.mdl """.format(command=run_opts.command, dir=dir, alidir=alidir, raw_mdl=(input_model if input_model is not None diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py index db4cb392f10..21874ad6923 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py @@ -247,3 +247,284 @@ def _add_components(self, input_desc, input_dim, nonlinearities): configs.append(line) cur_node = '{0}.{1}'.format(self.name, nonlinearity) return configs + + +# This class is for parsing lines like +# 'attention-block dim=768 bottleneck-dim=128 num-heads=8 value-dim=50 key-dim=50 time-stride=3 num-left-inputs=30 num-right-inputs=10 bypass-scale=0.66' +# +# It is a little like a TDNNF-layer, but with attention in the middle and no +# ReLU. Note: as of now, there is no nonlinearity other than what comes from +# the attention component itself (it has a softmax). Imagine the input and +# output dim of the layer is largish, like 768. +# +# So we go, 768 --(linear with orthonormal)--> 128 --(affine)--> attention-input-dim --(attention)--> (50+context-dim)*8 \ +# --(linear with orthonormal)-->128 -->(linear) 768 -> batchnorm, then add residual connection from original 768-dim input. +# +# ... where attention-input-dim equals value-dim + 2*key-dim + context-dim +# and context-dim = (num-left-inputs + 1 + num-right-inputs + 1) +# in this case it's 50 + 2*50 + (30+10+1) = 191. +# +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# bottleneck-dim=-1 [bottleneck dimension, e.g. 128.] +# num-heads=-1 [Number of attention heads, e.g. 8] +# value-dim=-1 [Dimension of values (the things which get weighted-averaged +# and then output. E.g. 50] +# key-dim=-1 [Dimension of the keys, e.g. 50. Affects the query +# dimension, but that's larger by context_dim, +# where context_dim == num-left-inputs+1+num-right-inputs. +# That's for the encoding of the position of the input frame.] +# dim=-1 [Dimension of the output of this layer (after the bottleneck; +# e.g. 768]. Defaults to the dimension of the input.] +# time-stride=1 [Time stride, dictates the spacing of the inputs to this +# layer. E.g. might be 3 in typical TDNN-F setups.] +# num-left-inputs=-1 [Number of inputs to the left that we use. Must be specified.] +# num-right-inputs=-1 [Number of inputs to the right that we use. Must be specified.] +# num-left-inputs-required: -1 [This affects the left/right context that the network will +# have, i.e. 
how many frames of input it will insist on having. +# It affects the behavior at chunk boundaries; larger will tend +# to be slower but more accurate. Note: the default of -1 means: +# use the same as num-left-inputs]. +# num-right-inputs-required: -1 [See comment for num-left-inputs-required] +# output-context: True [If true, the softmax weights will be an additional +# output of the attention heads.] +# key-scale: 0.0 [If >0.0, becomes a scaling factor on the keys. Otherwise, we +# use the default value of 1.0 / sqrt(key-dim).] +# +# +# bypass-scale : 0.66 [Scale on the input in the residual connection.] +# target-rms: 1.0 [Scaling on the output of the batchnorm] +# +# Extra configs that are passed into the affine and linear components: +# learning-rate-factor=1.0 [This can be used to make the affine component +# train faster or slower]. +# max-change=0.75 [maximum change per iteration, per component] +# l2-regularize=0.0 [l2 regularization constant for linear and affine components.] +# +# use-relu=False [If true, add relu] +# +# Documentation for the rest of the parameters (related to the +# attention component) can be found in nnet-attention-component.h + + +class XconfigAttentionBlock(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + # Here we just list some likely combinations.. you can just add any + # combinations you want to use, to this list. + assert first_token == 'attention-block' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. + self.config = { 'input':'[-1]', + 'dim': -1, + 'bottleneck-dim': -1, + 'num-heads': -1, + 'value-dim': -1, + 'key-dim': -1, + 'dim': -1, + 'time-stride': 1, + 'num-left-inputs': -1, + 'num-right-inputs': -1, + 'learning-rate-factor': 1.0, + 'max-change' : 0.75, + 'ng-affine-options' : '', + 'l2-regularize': 0.0, + 'num-left-inputs-required': -1, + 'num-right-inputs-required': -1, + 'output-context': True, + 'target-rms': 1.0, + 'key-scale': 0.0, + 'bypass-scale': 0.66, + 'use-relu': False} + + + def check_configs(self): + for x in [ 'bottleneck-dim', 'num-heads', 'value-dim', 'key-dim' ]: + if self.config[x] <= 0: + raise RuntimeError("Expected {} to be positive, got {}".format(x, self.config[x])) + for x in ['num-left-inputs', 'num-right-inputs' ]: + if self.config[x] < 0: + raise RuntimeError("Expected {} to be nonnegative, got {}".format(x, self.config[x])) + # Not checking everything here. 
+ if self.config['learning-rate-factor'] <= 0.0: + raise RuntimeError("learning-rate-factor has invalid value {0}" + .format(self.config['learning-rate-factor'])) + if self.config['key-scale'] == 0.0: + self.config['key-scale'] = 1.0 / math.sqrt(self.config['key-dim']) + + def output_name(self, auxiliary_output=None): + # at a later stage we might want to expose even the pre-nonlinearity + # vectors + return '{0}.noop'.format(self.name) + + def attention_input_dim(self): + context_dim = (self.config['num-left-inputs'] + + self.config['num-right-inputs'] + 1) + num_heads = self.config['num-heads'] + key_dim = self.config['key-dim'] + value_dim = self.config['value-dim'] + query_dim = key_dim + context_dim; + return num_heads * (key_dim + value_dim + query_dim) + + def attention_output_dim(self): + context_dim = (self.config['num-left-inputs'] + + self.config['num-right-inputs'] + 1) + num_heads = self.config['num-heads'] + value_dim = self.config['value-dim'] + return (num_heads * + (value_dim + + (context_dim if self.config['output-context'] else 0))) + + def output_dim(self, auxiliary_output = None): + dim = self.config['dim'] + if dim > 0: + return dim + else: + return self.descriptors['input']['dim'] + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.config['dim'] + if output_dim <= 0: + output_dim = input_dim + bottleneck_dim = self.config['bottleneck-dim'] + attention_input_dim = self.attention_input_dim() + attention_output_dim = self.attention_output_dim() + target_rms = self.config['target-rms'] + max_change = self.config['max-change'] + l2_regularize = self.config['l2-regularize'] + learning_rate_factor=self.config['learning-rate-factor'] + + learning_rate_option=('learning-rate-factor={0}'.format(learning_rate_factor) + if learning_rate_factor != 1.0 else '') + l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize) + if l2_regularize != 0.0 else '') + + common_options=("{lroption} {l2option} max-change={max_change} " + "".format(lroption = learning_rate_option, + l2option = l2_regularize_option, + max_change = max_change)) + + + configs = [] + + + # The first linear component + line = ('component name={0}.linear1 type=LinearComponent ' + 'input-dim={1} output-dim={2} ' + '{3} orthonormal-constraint=-1 ' + ''.format(self.name, input_dim, bottleneck_dim, + common_options)) + + configs.append(line) + line = ('component-node name={0}.linear1 component={0}.linear1 input={1} ' + ''.format(self.name, input_desc)) + configs.append(line) + + # The first affine component + line = ('component name={0}.affine1 type=NaturalGradientAffineComponent ' + 'input-dim={1} output-dim={2} ' + '{3}'.format(self.name, bottleneck_dim, attention_input_dim, + common_options)) + configs.append(line) + line = ('component-node name={0}.affine1 component={0}.affine1 input={0}.linear1' + ''.format(self.name, input_desc)) + configs.append(line) + + + line = ('component name={0}.layernorm1 type=NormalizeComponent dim={1} ' + ' '.format(self.name, 
attention_input_dim)) + configs.append(line) + line = ('component-node name={0}.layernorm1 component={0}.layernorm1 ' + 'input={0}.affine1 '.format(self.name)) + configs.append(line) + cur_name='layernorm1' + + # The attention component + line = ('component name={name}.attention type=RestrictedAttentionComponent ' + 'value-dim={v} key-dim={k} num-left-inputs={nl} ' + 'num-right-inputs={nr} num-left-inputs-required={nlr}' + ' num-right-inputs-required={nrr} output-context={oc}' + ' time-stride={ts} num-heads={nh} key-scale={ks}' + ''.format(name=self.name, + v=self.config['value-dim'], k=self.config['key-dim'], + nl=self.config['num-left-inputs'], + nr=self.config['num-right-inputs'], + nlr=self.config['num-left-inputs-required'], + nrr=self.config['num-right-inputs-required'], + oc=self.config['output-context'], + ts=self.config['time-stride'], + nh=self.config['num-heads'], + ks=self.config['key-scale'])) + configs.append(line) + line = ('component-node name={0}.attention component={0}.attention input={0}.{1}' + ''.format(self.name, cur_name)) + configs.append(line) + + # The second linear component + line = ('component name={0}.linear2 type=LinearComponent ' + 'input-dim={1} output-dim={2} orthonormal-constraint=-1 ' + '{3}'.format(self.name, attention_output_dim, bottleneck_dim, + common_options)) + configs.append(line) + line = ('component-node name={0}.linear2 component={0}.linear2 ' + 'input={0}.attention '.format(self.name)) + configs.append(line) + + # The third linear component + line = ('component name={0}.linear3 type=LinearComponent ' + 'input-dim={1} output-dim={2} ' + '{3}'.format(self.name, bottleneck_dim, output_dim, + common_options)) + configs.append(line) + line = ('component-node name={0}.linear3 component={0}.linear3 ' + 'input={0}.linear2 '.format(self.name)) + configs.append(line) + + + if self.config['use-relu']: + line = ('component name={0}.relu type=RectifiedLinearComponent dim={1} ' + ''.format(self.name, output_dim)) + configs.append(line) + line = ('component-node name={0}.relu component={0}.relu ' + 'input={0}.linear3 '.format(self.name)) + configs.append(line) + cur_name = 'relu' + else: + cur_name = 'linear3' + + + line = ('component name={0}.layernorm2 type=NormalizeComponent dim={1} ' + 'target-rms={2} '.format(self.name, output_dim, target_rms)) + configs.append(line) + line = ('component-node name={0}.layernorm2 component={0}.layernorm2 ' + 'input={0}.{1} '.format(self.name, cur_name)) + configs.append(line) + + + line = ('component name={0}.noop type=NoOpComponent dim={1}'.format( + self.name, output_dim)) + configs.append(line) + line = ('component-node name={name}.noop component={name}.noop input=Sum(Scale({b}, {i}), {name}.layernorm2)' + ''.format(name=self.name, b=self.config['bypass-scale'], i=input_desc)) + configs.append(line) + + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 5e21c4c0274..1fdf4759ec1 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -86,6 +86,7 @@ 'dim-range-component': xlayers.XconfigDimRangeComponent, 'offset-component': xlayers.XconfigPerElementOffsetComponent, 'combine-feature-maps-layer': xlayers.XconfigCombineFeatureMapsLayer, + 'attention-block': xlayers.XconfigAttentionBlock, 'delta-layer': xlayers.XconfigDeltaLayer } diff --git a/egs/wsj/s5/steps/lmrescore.sh b/egs/wsj/s5/steps/lmrescore.sh index 88db8ae15dc..abe5e740166 100755 --- a/egs/wsj/s5/steps/lmrescore.sh +++ 
b/egs/wsj/s5/steps/lmrescore.sh @@ -7,7 +7,6 @@ mode=4 # mode can be 1 through 5. They should all give roughly similar results # See the comments in the case statement for more details. cmd=run.pl skip_scoring=false -self_loop_scale=0.1 # only matters for mode 4. acoustic_scale=0.1 # only matters for mode 5. # End configuration section. @@ -22,8 +21,6 @@ if [ $# != 5 ]; then echo " --cmd # How to run commands (e.g. run.pl, queue.pl)" echo " --mode (1|2|3|4|5) # Mode of LM rescoring to use (default: 4)." echo " # These should give very similar results." - echo " --self-loop-scale # Self-loop-scale, only relevant in mode 4." - echo " # Default: 0.1." echo " --acoustic-scale # Acoustic scale, only relevant in mode 5." echo " # Default: 0.1." exit 1; @@ -109,8 +106,6 @@ case "$mode" in # grammar and transition weights. mdl=`dirname $indir`/final.mdl [ ! -f $mdl ] && echo No such model $mdl && exit 1; - [[ -f `dirname $indir`/frame_subsampling_factor && "$self_loop_scale" == 0.1 ]] && \ - echo "$0: WARNING: chain models need '--self-loop-scale 1.0'"; $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ gunzip -c $indir/lat.JOB.gz \| \ lattice-scale --lm-scale=0.0 ark:- ark:- \| \ @@ -118,8 +113,7 @@ case "$mode" in lattice-compose ark:- $outdir/Ldet.fst ark:- \| \ lattice-determinize ark:- ark:- \| \ lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \ - lattice-add-trans-probs --transition-scale=1.0 --self-loop-scale=$self_loop_scale \ - $mdl ark:- ark:- \| \ + lattice-add-trans-probs $mdl ark:- ark:- \| \ gzip -c \>$outdir/lat.JOB.gz || exit 1; ;; 5) # Mode 5 uses the binary lattice-lmrescore-pruned to do the LM rescoring diff --git a/egs/wsj/s5/steps/make_phone_graph.sh b/egs/wsj/s5/steps/make_phone_graph.sh index aaf88cc66d2..c7b57374f1c 100755 --- a/egs/wsj/s5/steps/make_phone_graph.sh +++ b/egs/wsj/s5/steps/make_phone_graph.sh @@ -15,8 +15,6 @@ stage=0 cmd=run.pl N=3 # change N and P for non-trigram systems. P=1 -tscale=1.0 # transition scale. -loopscale=0.1 # scale for self-loops. # End configuration section. echo "$0 $@" # Print the command line for logging @@ -117,7 +115,7 @@ fi if [ $stage -le 5 ]; then echo "$0: creating Ha.fst" make-h-transducer --disambig-syms-out=$dir/phone_graph/disambig_tid.int \ - --transition-scale=$tscale $dir/phone_graph/ilabels_${N}_${P} $dir/tree $dir/final.mdl \ + $dir/phone_graph/ilabels_${N}_${P} $dir/tree $dir/final.mdl \ > $dir/phone_graph/Ha.fst fi @@ -131,13 +129,9 @@ if [ $stage -le 6 ]; then fi if [ $stage -le 7 ]; then - add-self-loops --self-loop-scale=$loopscale --reorder=true \ - $dir/final.mdl < $dir/phone_graph/HCLGa.fst > $dir/phone_graph/HCLG.fst || exit 1; + add-self-loops $dir/final.mdl < $dir/phone_graph/HCLGa.fst > $dir/phone_graph/HCLG.fst || exit 1; - if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then - # No point doing this test if transition-scale not 1, as it is bound to fail. - fstisstochastic $dir/phone_graph/HCLG.fst || echo "[info]: final HCLG is not stochastic." - fi + fstisstochastic $dir/phone_graph/HCLG.fst || echo "[info]: final HCLG is not stochastic." # $lang/phones.txt is the symbol table that corresponds to the output # symbols on the graph; decoding scripts expect it as words.txt. diff --git a/egs/wsj/s5/steps/nnet/align.sh b/egs/wsj/s5/steps/nnet/align.sh index f976711fa58..1a05e620394 100755 --- a/egs/wsj/s5/steps/nnet/align.sh +++ b/egs/wsj/s5/steps/nnet/align.sh @@ -10,7 +10,7 @@ nj=4 cmd=run.pl stage=0 # Begin configuration. 
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 nnet_forward_opts="--no-softmax=true --prior-scale=1.0" @@ -19,7 +19,7 @@ text= # (optional) transcipts we align to, align_to_lats=false # optionally produce alignment in lattice format lats_decode_opts="--acoustic-scale=0.1 --beam=20 --lattice_beam=10" - lats_graph_scales="--transition-scale=1.0 --self-loop-scale=0.1" + lats_graph_scales="" use_gpu="no" # yes|no|optionaly # End configuration options. diff --git a/egs/wsj/s5/steps/nnet2/align.sh b/egs/wsj/s5/steps/nnet2/align.sh index fa040d692ad..5b89655ef37 100755 --- a/egs/wsj/s5/steps/nnet2/align.sh +++ b/egs/wsj/s5/steps/nnet2/align.sh @@ -9,7 +9,7 @@ nj=4 cmd=run.pl # Begin configuration. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 transform_dir= diff --git a/egs/wsj/s5/steps/nnet3/align.sh b/egs/wsj/s5/steps/nnet3/align.sh index aa2de2ee1a5..75102f2c1d7 100755 --- a/egs/wsj/s5/steps/nnet3/align.sh +++ b/egs/wsj/s5/steps/nnet3/align.sh @@ -13,7 +13,7 @@ nj=4 cmd=run.pl # Begin configuration. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 iter=final @@ -104,10 +104,10 @@ if [ -f $srcdir/frame_subsampling_factor ]; then frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" cp $srcdir/frame_subsampling_factor $dir if [ "$frame_subsampling_factor" -gt 1 ] && \ - [ "$scale_opts" == "--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" ]; then + [ "$scale_opts" == "--acoustic-scale=0.1" ]; then echo "$0: frame-subsampling-factor is not 1 (so likely a chain system)," echo "... but the scale opts are the defaults. You probably want" - echo "--scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0'" + echo "--scale-opts '--acoustic-scale=1.0 '" sleep 1 fi fi diff --git a/egs/wsj/s5/steps/nnet3/align_lats.sh b/egs/wsj/s5/steps/nnet3/align_lats.sh index 201cc3552ba..c3b55e8547c 100755 --- a/egs/wsj/s5/steps/nnet3/align_lats.sh +++ b/egs/wsj/s5/steps/nnet3/align_lats.sh @@ -13,7 +13,6 @@ nj=4 cmd=run.pl stage=-1 # Begin configuration. -scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" acoustic_scale=0.1 beam=20 iter=final @@ -94,13 +93,10 @@ if [ -f $srcdir/frame_subsampling_factor ]; then cp $srcdir/frame_subsampling_factor $dir if [[ $frame_subsampling_factor -gt 1 ]]; then # Assume a chain system, check agrument sanity. - if [[ ! ($scale_opts == *--self-loop-scale=1.0* && - $scale_opts == *--transition-scale=1.0* && - $acoustic_scale = '1.0') ]]; then + if [[ $acoustic_scale = '1.0') ]]; then echo "$0: ERROR: frame-subsampling-factor is not 1, assuming a chain system." echo "... You should pass the following options to this script:" - echo " --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'" \ - "--acoustic_scale 1.0" + echo "--acoustic_scale 1.0" fi fi fi @@ -122,7 +118,6 @@ if [ $stage -le 0 ]; then ## because the other scripts write them without transition probs. 
$cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ $prog --read-disambig-syms=$lang/phones/disambig.int \ - $scale_opts \ $dir/tree $srcdir/${iter}.mdl $lang/L.fst "$tra" \ "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1 fi diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh index 757963f13a7..ebb9e24902f 100755 --- a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh @@ -23,7 +23,6 @@ context_opts= # e.g. set this to "--context-width 5 --central-position 2" for q cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves frame_subsampling_factor=1 alignment_subsampling_factor= -leftmost_questions_truncate=-1 # note: this option is deprecated and has no effect tree_stats_opts= cluster_phones_opts= repeat_frames=false diff --git a/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh b/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh index 07d5ee8cfb8..3503cc57f50 100755 --- a/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh +++ b/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh @@ -20,7 +20,7 @@ type=mono # Can be either mono or biphone -- either way # the resulting tree is full (i.e. it doesn't do any tying) ci_silence=false # If true, silence phones will be treated as context independent -scale_opts="--transition-scale=0.0 --self-loop-scale=0.0" +scale_opts=" " tie=false # If true, gmm-init-biphone will do some tying when # creating the full biphone tree (it won't be full anymore). # Specifically, it will revert to monophone if the data diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo.pl b/egs/wsj/s5/steps/nnet3/chain/gen_topo.pl deleted file mode 100755 index 32dfa272a97..00000000000 --- a/egs/wsj/s5/steps/nnet3/chain/gen_topo.pl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env perl - -# Copyright 2012 Johns Hopkins University (author: Daniel Povey) - -# Generate a topology file. This allows control of the number of states in the -# non-silence HMMs, and in the silence HMMs. This is a modified version of -# 'utils/gen_topo.pl' that generates a different type of topology, one that we -# believe should be useful in the 'chain' model. Note: right now it doesn't -# have any real options, and it treats silence and nonsilence the same. The -# intention is that you write different versions of this script, or add options, -# if you experiment with it. - -if (@ARGV != 2) { - print STDERR "Usage: utils/gen_topo.pl \n"; - print STDERR "e.g.: utils/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n"; - exit (1); -} - -($nonsil_phones, $sil_phones) = @ARGV; - -$nonsil_phones =~ s/:/ /g; -$sil_phones =~ s/:/ /g; -$nonsil_phones =~ m/^\d[ \d]+$/ || die "$0: bad arguments @ARGV\n"; -$sil_phones =~ m/^\d[ \d]*$/ || die "$0: bad arguments @ARGV\n"; - -print "\n"; -print "\n"; -print "\n"; -print "$nonsil_phones $sil_phones\n"; -print "\n"; -# The next two lines may look like a bug, but they are as intended. State 0 has -# no self-loop, it happens exactly once. And it can go either to state 1 (with -# a self-loop) or to state 2, so we can have zero or more instances of state 1 -# following state 0. -# We make the transition-probs 0.5 so they normalize, to keep the code happy. -# In fact, we always set the transition probability scale to 0.0 in the 'chain' -# code, so they are never used. 
-print " 0 0 1 0.5 2 0.5 \n"; -print " 1 1 1 0.5 2 0.5 \n"; -print " 2 \n"; -print "\n"; -print "\n"; diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo.py index 88def77451b..f587d1b8448 100755 --- a/egs/wsj/s5/steps/nnet3/chain/gen_topo.py +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo.py @@ -32,17 +32,15 @@ nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] all_phones = silence_phones + nonsilence_phones + print("") print("") print("") print(" ".join([str(x) for x in all_phones])) print("") -# We make the transition-probs 0.5 so they normalize, to keep the code happy. -# In fact, we always set the transition probability scale to 0.0 in the 'chain' -# code, so they are never used. -# Note: the will actually happen on the incoming arc because -# we always build the graph with "reorder=true". -print(" 0 0 1 0 0.5 1 0.5 ") -print(" 1 ") +print("0 1 1 0.0") +print("1 1 2 0.69314718055") +print("1 0.69314718055") +print("") print("") print("") diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo2.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo2.py deleted file mode 100755 index a33dab666e6..00000000000 --- a/egs/wsj/s5/steps/nnet3/chain/gen_topo2.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2012 Johns Hopkins University (author: Daniel Povey) - -# Generate a topology file. This allows control of the number of states in the -# non-silence HMMs, and in the silence HMMs. This is a modified version of -# 'utils/gen_topo.pl' that generates a different type of topology, one that we -# believe should be useful in the 'chain' model. Note: right now it doesn't -# have any real options, and it treats silence and nonsilence the same. The -# intention is that you write different versions of this script, or add options, -# if you experiment with it. - -from __future__ import print_function -import argparse - - -parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " - " " - "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", - epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); -parser.add_argument("nonsilence_phones", type=str, - help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); -parser.add_argument("silence_phones", type=str, - help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); - -args = parser.parse_args() - -silence_phones = [ int(x) for x in args.silence_phones.split(":") ] -nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] -all_phones = silence_phones + nonsilence_phones - -print("") -print("") -print("") -print(" ".join([str(x) for x in all_phones])) -print("") - -# the pdf-classes are as follows: -# pdf-class 0 is in a 1-frame sequence, the initial and final state. -# pdf-class 1 is in a sequence with >=3 frames, the 'middle' states. (important that -# it be numbered 1, which is the default list of pdf-classes used in 'cluster-phones'). -# pdf-class 2 is the initial-state in a sequence with >= 2 frames. -# pdf-class 3 is the final-state in a sequence with >= 2 frames. -# state 0 is nonemitting in this topology. - -print(" 0 1 0.5 2 0.5 ") # initial nonemitting state. -print(" 1 0 5 1.0 ") # 1-frame sequence. -print(" 2 2 3 0.5 4 0.5 ") # 2 or more frames -print(" 3 1 3 0.5 4 0.5 ") # 3 or more frames -print(" 4 3 5 1.0 ") # 2 or more frames. 
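The replacement entry that the new steps/nnet3/chain/gen_topo.py prints above uses the FSA-style topology format: each line is "from-state to-state pdf-class cost", and a line with only "state cost" gives a final cost; the literal 0.69314718055 is -log(0.5). A minimal sketch of the same entry, with the cost checked (illustrative; the authoritative generator is gen_topo.py itself):

import math

def chain_topology_entry():
    # The first frame of a phone emits pdf-class 1; every additional frame
    # takes the self-loop and emits pdf-class 2; ending has probability 0.5.
    return ["0 1 1 0.0",
            "1 1 2 0.69314718055",
            "1 0.69314718055"]

assert abs(-math.log(0.5) - 0.69314718055) < 1e-10   # the cost is just -log(0.5)
print("\n".join(chain_topology_entry()))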
-print(" 5 ") # final nonemitting state - -print("") -print("") - diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo3.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo3.py deleted file mode 100755 index f43f5046813..00000000000 --- a/egs/wsj/s5/steps/nnet3/chain/gen_topo3.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2012 Johns Hopkins University (author: Daniel Povey) - -# Generate a topology file. This allows control of the number of states in the -# non-silence HMMs, and in the silence HMMs. This is a modified version of -# 'utils/gen_topo.pl' that generates a different type of topology, one that we -# believe should be useful in the 'chain' model. Note: right now it doesn't -# have any real options, and it treats silence and nonsilence the same. The -# intention is that you write different versions of this script, or add options, -# if you experiment with it. - -from __future__ import print_function -import argparse - - -parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " - " " - "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", - epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); -parser.add_argument("nonsilence_phones", type=str, - help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); -parser.add_argument("silence_phones", type=str, - help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); - -args = parser.parse_args() - -silence_phones = [ int(x) for x in args.silence_phones.split(":") ] -nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] -all_phones = silence_phones + nonsilence_phones - -print("") -print("") -print("") -print(" ".join([str(x) for x in all_phones])) -print("") -print(" 0 0 0 0.5 1 0.5 ") -print(" 1 ") -print("") -print("") - diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo4.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo4.py deleted file mode 100755 index 6d88a6e4449..00000000000 --- a/egs/wsj/s5/steps/nnet3/chain/gen_topo4.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2012 Johns Hopkins University (author: Daniel Povey) - -# Generate a topology file. This allows control of the number of states in the -# non-silence HMMs, and in the silence HMMs. This is a modified version of -# 'utils/gen_topo.pl' that generates a different type of topology, one that we -# believe should be useful in the 'chain' model. Note: right now it doesn't -# have any real options, and it treats silence and nonsilence the same. The -# intention is that you write different versions of this script, or add options, -# if you experiment with it. - -from __future__ import print_function -import argparse - - -parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " - " " - "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", - epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); -parser.add_argument("nonsilence_phones", type=str, - help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); -parser.add_argument("silence_phones", type=str, - help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); - -args = parser.parse_args() - -silence_phones = [ int(x) for x in args.silence_phones.split(":") ] -nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] -all_phones = silence_phones + nonsilence_phones - -print("") -print("") -print("") -print(" ".join([str(x) for x in all_phones])) -print("") -# state 0 is obligatory (occurs once) -print(" 0 0 1 0.3333 2 0.3333 3 0.3333 ") -# state 1 is used only when >2 frames -print(" 1 1 1 0.5 2 0.5 ") -# state 2 is used only when >=2 frames (and occurs once) -print(" 2 2 3 1.0 ") -print(" 3 ") # final nonemitting state -print("") -print("") - diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo5.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo5.py index 1583966b58c..9df502545a5 100755 --- a/egs/wsj/s5/steps/nnet3/chain/gen_topo5.py +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo5.py @@ -2,6 +2,9 @@ # Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# This script was modified around 11.11.2016, when the code was extended to +# support having a different pdf-class on the self loop. + # Generate a topology file. This allows control of the number of states in the # non-silence HMMs, and in the silence HMMs. This is a modified version of # 'utils/gen_topo.pl' that generates a different type of topology, one that we @@ -29,22 +32,17 @@ nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] all_phones = silence_phones + nonsilence_phones + print("") print("") print("") print(" ".join([str(x) for x in all_phones])) print("") -# state 0 is nonemitting -print(" 0 1 0.5 2 0.5 ") -# state 1 is for when we traverse it in 1 state -print(" 1 0 4 1.0 ") -# state 2 is for when we traverse it in >1 state, for the first state. -print(" 2 2 3 1.0 ") -# state 3 is for the self-loop. Use pdf-class 1 here so that the default -# phone-class clustering (which uses only pdf-class 1 by default) gets only -# stats from longer phones. -print(" 3 1 3 0.5 4 0.5 ") -print(" 4 ") +print("0 1 1 0.69314718055") +print("0 2 3 0.69314718055") +print("1 1 2 0.69314718055") +print("1 0.69314718055") +print("2 0.0") +print("") print("") print("") - diff --git a/egs/wsj/s5/steps/nnet3/get_degs.sh b/egs/wsj/s5/steps/nnet3/get_degs.sh index 7853daa4563..4007419bc47 100755 --- a/egs/wsj/s5/steps/nnet3/get_degs.sh +++ b/egs/wsj/s5/steps/nnet3/get_degs.sh @@ -59,9 +59,6 @@ nj=200 iter=final -# decoding-graph option -self_loop_scale=0.1 # for decoding graph.. should be 1.0 for chain models. - # options relating to decoding. frames_per_chunk_decoding=150 beam=13.0 @@ -156,21 +153,10 @@ if [ -f $srcdir/frame_subsampling_factor ]; then # e.g. for 'chain' systems frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" cp $srcdir/frame_subsampling_factor $dir - if [ $frame_subsampling_factor -ne 1 ] && [ "$self_loop_scale" == "0.1" ]; then - echo "$0: warning: frame_subsampling_factor is not 1 (so likely a chain system)," - echo "... but self-loop-scale is 0.1. Make sure this is not a mistake." - sleep 1 - fi else frame_subsampling_factor=1 fi -if [ "$self_loop_scale" == "1.0" ] && [ "$acwt" == 0.1 ]; then - echo "$0: warning: you set --self-loop-scale=1.0 (so likely a chain system)", - echo " ... but the acwt is still 0.1 (you probably want --acwt 1.0)" - sleep 1 -fi - ## Make the decoding graph. 
if [ $stage -le 0 ]; then new_lang="$dir/"$(basename "$lang") @@ -183,7 +169,7 @@ if [ $stage -le 0 ]; then utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \ || exit 1; - utils/mkgraph.sh --self-loop-scale $self_loop_scale $new_lang $srcdir $dir/dengraph || exit 1; + utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1; fi # copy alignments into ark,scp format which allows us to use different num-jobs diff --git a/egs/wsj/s5/steps/nnet3/make_denlats.sh b/egs/wsj/s5/steps/nnet3/make_denlats.sh index 36da179bbaf..29df25cbf91 100755 --- a/egs/wsj/s5/steps/nnet3/make_denlats.sh +++ b/egs/wsj/s5/steps/nnet3/make_denlats.sh @@ -17,7 +17,6 @@ sub_split=1 beam=13.0 frames_per_chunk=50 lattice_beam=7.0 -self_loop_scale=0.1 acwt=0.1 max_active=5000 min_active=200 @@ -102,7 +101,7 @@ else awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \ || exit 1; - utils/mkgraph.sh --self-loop-scale $self_loop_scale $new_lang $srcdir $dir/dengraph || exit 1; + utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1; fi cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` cp $srcdir/cmvn_opts $dir 2>/dev/null diff --git a/egs/wsj/s5/steps/online/nnet2/align.sh b/egs/wsj/s5/steps/online/nnet2/align.sh index c24bbf0291e..249947ecfd9 100755 --- a/egs/wsj/s5/steps/online/nnet2/align.sh +++ b/egs/wsj/s5/steps/online/nnet2/align.sh @@ -14,7 +14,7 @@ nj=4 cmd=run.pl # Begin configuration. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 iter=final diff --git a/egs/wsj/s5/steps/segmentation/internal/prepare_sad_graph.py b/egs/wsj/s5/steps/segmentation/internal/prepare_sad_graph.py index 12c9bb1e902..e1d6702212e 100755 --- a/egs/wsj/s5/steps/segmentation/internal/prepare_sad_graph.py +++ b/egs/wsj/s5/steps/segmentation/internal/prepare_sad_graph.py @@ -41,13 +41,6 @@ def get_args(): duration constraint.""", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("--transition-scale", type=float, default=1.0, - help="""Scale on transition probabilities relative to - LM weights""") - parser.add_argument("--loopscale", type=float, default=0.1, - help="""Scale on self-loop log-probabilities relative - to LM weights""") - parser.add_argument("--min-silence-duration", type=float, default=0.03, help="""Minimum duration for silence""") parser.add_argument("--min-speech-duration", type=float, default=0.3, diff --git a/egs/wsj/s5/steps/tandem/align_fmllr.sh b/egs/wsj/s5/steps/tandem/align_fmllr.sh index 0b012e24146..12526f6f792 100755 --- a/egs/wsj/s5/steps/tandem/align_fmllr.sh +++ b/egs/wsj/s5/steps/tandem/align_fmllr.sh @@ -19,7 +19,7 @@ nj=4 cmd=run.pl use_graphs=false # Begin configuration. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 boost_silence=1.0 # factor by which to boost silence during alignment. diff --git a/egs/wsj/s5/steps/tandem/align_sgmm2.sh b/egs/wsj/s5/steps/tandem/align_sgmm2.sh index 48eb1fbef43..ab41834dfcb 100755 --- a/egs/wsj/s5/steps/tandem/align_sgmm2.sh +++ b/egs/wsj/s5/steps/tandem/align_sgmm2.sh @@ -19,7 +19,7 @@ use_gselect=false # use gselect info from srcdir [regardless, we use # Gaussian-selection info, we might have to compute it though.] gselect=15 # Number of Gaussian-selection indices for SGMMs. # Begin configuration. 
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 transform_dir= # directory to find fMLLR transforms in. diff --git a/egs/wsj/s5/steps/tandem/align_si.sh b/egs/wsj/s5/steps/tandem/align_si.sh index 4e52c51e308..1cd9e534165 100755 --- a/egs/wsj/s5/steps/tandem/align_si.sh +++ b/egs/wsj/s5/steps/tandem/align_si.sh @@ -16,7 +16,7 @@ nj=4 cmd=run.pl use_graphs=false # Begin configuration. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 boost_silence=1.0 # Factor by which to boost silence during alignment. diff --git a/egs/wsj/s5/steps/tandem/train_deltas.sh b/egs/wsj/s5/steps/tandem/train_deltas.sh index d6a1baa6623..70fb30dcb9c 100755 --- a/egs/wsj/s5/steps/tandem/train_deltas.sh +++ b/egs/wsj/s5/steps/tandem/train_deltas.sh @@ -8,7 +8,7 @@ stage=-4 # This allows restarting after partway, when something when wrong. config= cmd=run.pl -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" realign_iters="10 20 30"; num_iters=35 # Number of iterations of training max_iter_inc=25 # Last iter to increase #Gauss on. diff --git a/egs/wsj/s5/steps/tandem/train_lda_mllt.sh b/egs/wsj/s5/steps/tandem/train_lda_mllt.sh index a5fa4ea8786..67ca80b11ff 100755 --- a/egs/wsj/s5/steps/tandem/train_lda_mllt.sh +++ b/egs/wsj/s5/steps/tandem/train_lda_mllt.sh @@ -8,7 +8,7 @@ cmd=run.pl config= stage=-5 -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" realign_iters="10 20 30"; mllt_iters="2 4 6 12"; num_iters=35 # Number of iterations of training diff --git a/egs/wsj/s5/steps/tandem/train_mllt.sh b/egs/wsj/s5/steps/tandem/train_mllt.sh index 7d46074baec..e8796c8f5db 100755 --- a/egs/wsj/s5/steps/tandem/train_mllt.sh +++ b/egs/wsj/s5/steps/tandem/train_mllt.sh @@ -12,7 +12,7 @@ cmd=run.pl config= stage=-5 -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" realign_iters="10 20 30"; mllt_iters="2 4 6 12"; num_iters=35 # Number of iterations of training diff --git a/egs/wsj/s5/steps/tandem/train_mono.sh b/egs/wsj/s5/steps/tandem/train_mono.sh index b5c55f6f369..486478709d6 100755 --- a/egs/wsj/s5/steps/tandem/train_mono.sh +++ b/egs/wsj/s5/steps/tandem/train_mono.sh @@ -11,7 +11,7 @@ # Begin configuration section. nj=4 cmd=run.pl -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" num_iters=40 # Number of iterations of training max_iter_inc=30 # Last iter to increase #Gauss on. totgauss=1000 # Target #Gaussians. 
diff --git a/egs/wsj/s5/steps/tandem/train_sat.sh b/egs/wsj/s5/steps/tandem/train_sat.sh index 09e3f625674..2bfd2130b55 100755 --- a/egs/wsj/s5/steps/tandem/train_sat.sh +++ b/egs/wsj/s5/steps/tandem/train_sat.sh @@ -14,7 +14,7 @@ stage=-5 fmllr_update_type=full cmd=run.pl -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment diff --git a/egs/wsj/s5/steps/tandem/train_sgmm2.sh b/egs/wsj/s5/steps/tandem/train_sgmm2.sh index daa0437b47b..2df69708c8a 100755 --- a/egs/wsj/s5/steps/tandem/train_sgmm2.sh +++ b/egs/wsj/s5/steps/tandem/train_sgmm2.sh @@ -16,7 +16,7 @@ cmd=run.pl stage=-6 # use this to resume partially finished training context_opts= # e.g. set it to "--context-width=5 --central-position=2" for a # quinphone system. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" num_iters=25 # Total number of iterations of training num_iters_alimdl=3 # Number of iterations for estimating alignment model. max_iter_inc=15 # Last iter to increase #substates on. diff --git a/egs/wsj/s5/steps/train_deltas.sh b/egs/wsj/s5/steps/train_deltas.sh index 7deace6b13e..452cb2852cd 100755 --- a/egs/wsj/s5/steps/train_deltas.sh +++ b/egs/wsj/s5/steps/train_deltas.sh @@ -7,7 +7,7 @@ stage=-4 # This allows restarting after partway, when something when wrong. config= cmd=run.pl -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" realign_iters="10 20 30"; num_iters=35 # Number of iterations of training max_iter_inc=25 # Last iter to increase #Gauss on. diff --git a/egs/wsj/s5/steps/train_lda_mllt.sh b/egs/wsj/s5/steps/train_lda_mllt.sh index a1828aa6fcb..60ce32bbd35 100755 --- a/egs/wsj/s5/steps/train_lda_mllt.sh +++ b/egs/wsj/s5/steps/train_lda_mllt.sh @@ -14,7 +14,7 @@ cmd=run.pl config= stage=-5 -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" realign_iters="10 20 30"; mllt_iters="2 4 6 12"; num_iters=35 # Number of iterations of training diff --git a/egs/wsj/s5/steps/train_lvtln.sh b/egs/wsj/s5/steps/train_lvtln.sh index 111e0598edf..264171da00a 100755 --- a/egs/wsj/s5/steps/train_lvtln.sh +++ b/egs/wsj/s5/steps/train_lvtln.sh @@ -17,7 +17,7 @@ stage=-6 # This allows restarting after partway, when something when wrong. config= cmd=run.pl -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" realign_iters="10 20 30"; num_iters=35 # Number of iterations of training max_iter_inc=25 # Last iter to increase #Gauss on. diff --git a/egs/wsj/s5/steps/train_mono.sh b/egs/wsj/s5/steps/train_mono.sh index 5a0b79a4a1c..3b4744db2ed 100755 --- a/egs/wsj/s5/steps/train_mono.sh +++ b/egs/wsj/s5/steps/train_mono.sh @@ -11,7 +11,7 @@ # Begin configuration section. nj=4 cmd=run.pl -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" num_iters=40 # Number of iterations of training max_iter_inc=30 # Last iter to increase #Gauss on. initial_beam=6 # beam used in the first iteration (set smaller to speed up initialization) diff --git a/egs/wsj/s5/steps/train_quick.sh b/egs/wsj/s5/steps/train_quick.sh index 3325c4964e9..4e3c807484a 100755 --- a/egs/wsj/s5/steps/train_quick.sh +++ b/egs/wsj/s5/steps/train_quick.sh @@ -10,7 +10,7 @@ # Begin configuration.. 
cmd=run.pl -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" realign_iters="10 15"; # Only realign twice. num_iters=20 # Number of iterations of training maxiterinc=15 # Last iter to increase #Gauss on. diff --git a/egs/wsj/s5/steps/train_raw_sat.sh b/egs/wsj/s5/steps/train_raw_sat.sh index aa5e8813d71..615988096e7 100755 --- a/egs/wsj/s5/steps/train_raw_sat.sh +++ b/egs/wsj/s5/steps/train_raw_sat.sh @@ -14,7 +14,7 @@ # Begin configuration section. stage=-6 cmd=run.pl -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment diff --git a/egs/wsj/s5/steps/train_sat.sh b/egs/wsj/s5/steps/train_sat.sh index 92b744dc75c..4219b52f804 100755 --- a/egs/wsj/s5/steps/train_sat.sh +++ b/egs/wsj/s5/steps/train_sat.sh @@ -17,7 +17,7 @@ exit_stage=-100 # you can use this to require it to exit at the # supported. fmllr_update_type=full cmd=run.pl -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 careful=false diff --git a/egs/wsj/s5/steps/train_sat_basis.sh b/egs/wsj/s5/steps/train_sat_basis.sh index 5245ea0c619..c2b5591a773 100755 --- a/egs/wsj/s5/steps/train_sat_basis.sh +++ b/egs/wsj/s5/steps/train_sat_basis.sh @@ -13,7 +13,7 @@ # Begin configuration section. stage=-5 cmd=run.pl -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" beam=10 retry_beam=40 boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment diff --git a/egs/wsj/s5/steps/train_segmenter.sh b/egs/wsj/s5/steps/train_segmenter.sh index 515005c0257..64d006c2e75 100755 --- a/egs/wsj/s5/steps/train_segmenter.sh +++ b/egs/wsj/s5/steps/train_segmenter.sh @@ -8,7 +8,7 @@ stage=-4 # For restarting a process that went part way. config= cmd=run.pl -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" realign_iters="10 20 30"; num_iters=35 # Number of iterations of training max_iter_inc=25 # Last iter to increase #Gauss on. diff --git a/egs/wsj/s5/steps/train_sgmm2.sh b/egs/wsj/s5/steps/train_sgmm2.sh index 7f7df2e046a..812387599af 100755 --- a/egs/wsj/s5/steps/train_sgmm2.sh +++ b/egs/wsj/s5/steps/train_sgmm2.sh @@ -14,7 +14,7 @@ cmd=run.pl stage=-6 # use this to resume partially finished training context_opts= # e.g. set it to "--context-width=5 --central-position=2" for a # quinphone system. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" num_iters=25 # Total number of iterations of training num_iters_alimdl=3 # Number of iterations for estimating alignment model. max_iter_inc=15 # Last iter to increase #substates on. diff --git a/egs/wsj/s5/steps/train_sgmm2_group.sh b/egs/wsj/s5/steps/train_sgmm2_group.sh index 7263e2d5e8e..59cfb51e9ab 100755 --- a/egs/wsj/s5/steps/train_sgmm2_group.sh +++ b/egs/wsj/s5/steps/train_sgmm2_group.sh @@ -17,7 +17,7 @@ cmd=run.pl stage=-6 # use this to resume partially finished training context_opts= # e.g. set it to "--context-width=5 --central-position=2" for a # quinphone system. 
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +scale_opts="--acoustic-scale=0.1" num_iters=25 # Total number of iterations of training num_iters_alimdl=3 # Number of iterations for estimating alignment model. max_iter_inc=15 # Last iter to increase #substates on. diff --git a/egs/wsj/s5/utils/gen_topo.pl b/egs/wsj/s5/utils/gen_topo.pl index 1c02ed0eaeb..896f41d870d 100755 --- a/egs/wsj/s5/utils/gen_topo.pl +++ b/egs/wsj/s5/utils/gen_topo.pl @@ -4,6 +4,9 @@ # Generate a topology file. This allows control of the number of states in the # non-silence HMMs, and in the silence HMMs. +# This is the topology we use for GMM training, which is, when configured +# with 3 states, the Bakis model. For chain (lattice-free MMI) training, see +# steps/chain/gen_topo.pl. if (@ARGV != 4) { print STDERR "Usage: utils/gen_topo.pl \n"; @@ -28,52 +31,75 @@ print "\n"; print "$nonsil_phones\n"; print "\n"; -for ($state = 0; $state < $num_nonsil_states; $state++) { - $statep1 = $state+1; - print " $state $state $state 0.75 $statep1 0.25 \n"; +# The following is the single transition leaving the start-state. It has pdf-id +# 1, corresponding to state 1 which it enters.. The cost is 0.0 = log(1); there +# is only one choice here. Note: there are actually $num_nonsil_states + 1 +# states, but in HMM terms it's equivalent to $num_nonsil_states states; +# and that's the length of the shortest successful path. +print "0 1 1 0.0\n"; +for ($state = 1; $state <= $num_nonsil_states; $state++) { + $pdf_class = $state; + $next_state = $state + 1; + $next_pdf_class = $next_state; + # self-loop. + print "$state $state $pdf_class 0.6931471806\n"; + if ($next_state <= $num_nonsil_states) { + print "$state $next_state $next_pdf_class 0.6931471806\n"; + } else { + print "$state 0.6931471806\n"; # final-prob. + } } -print " $num_nonsil_states \n"; # non-emitting final state. +print "\n"; # terminate the FSA.. empty line marks its end. print "\n"; # Now silence phones. They have a different topology-- apart from the first and # last states, it's fully connected, as long as you have >= 3 states. +print "\n"; +print "\n"; +print "$sil_phones\n"; +print "\n"; + + +print "0 1 1 0.0\n"; if ($num_sil_states > 1) { - $transp = 1.0 / ($num_sil_states-1); - print "\n"; - print "\n"; - print "$sil_phones\n"; - print "\n"; - print " 0 0 "; - for ($nextstate = 0; $nextstate < $num_sil_states-1; $nextstate++) { # Transitions to all but last - # emitting state. - print " $nextstate $transp "; + # Note: actually it must be >= 3, we checked this above; + # 2 is disallowed (I know, it's odd). + # Also note: $num_sil_states is not actually the number of states + # in the FSA; it's the number of states in its HMM equivalent. + # the FSA has one extra state, state 0. + # we'll treat the final state, numbered $num_sil_states, + # separately; it doesn't have the transition back to + # lower-numbered states. + + $self_loop_cost = 0.6931471806; # -log(0.5) + $non_self_loop_cost = -log(0.5 / ($num_sil_states - 2)); + + $state = 1; + $pdf_id = $state; + print "$state $state $pdf_id $self_loop_cost\n"; + for ($next_state = 2; $next_state < $num_sil_states; $next_state++) { + $next_pdf_id = $next_state; + print "$state $next_state $next_pdf_id $non_self_loop_cost\n"; } - print "\n"; - for ($state = 1; $state < $num_sil_states-1; $state++) { # the central states all have transitions to - # themselves and to the last emitting state. 
- print " $state $state "; - for ($nextstate = 1; $nextstate < $num_sil_states; $nextstate++) { - print " $nextstate $transp "; + + for ($state = 2; $state < $num_sil_states; $state++) { + $pdf_id = $state; + for ($next_state = 2; $next_state <= $num_sil_states; $next_state++) { + my $cost = ($next_state == $state ? $self_loop_cost : $non_self_loop_cost); + $next_pdf_id = $next_state; + print "$state $next_state $next_pdf_id $cost\n"; } - print "\n"; } - # Final emitting state (non-skippable). - $state = $num_sil_states-1; - print " $state $state $state 0.75 $num_sil_states 0.25 \n"; - # Final nonemitting state: - print " $num_sil_states \n"; - print "\n"; + $final_state = $num_sil_states; + $pdf_id = $final_state; + print "$final_state $final_state $pdf_id $self_loop_cost\n"; + print "$final_state 0.6931471806\n"; + print "\n"; } else { - print "\n"; - print "\n"; - print "$sil_phones\n"; - print "\n"; - print " 0 0 "; - print " 0 0.75 "; - print " 1 0.25 "; - print "\n"; - print " $num_sil_states \n"; # non-emitting final state. - print "\n"; + print "0 0 1 0.6931471806\n"; + print "1 1 1 0.6931471806\n"; + print "1 0.6931471806\n"; + print "\n"; } - +print "\n"; print "\n"; diff --git a/egs/wsj/s5/utils/mkgraph.sh b/egs/wsj/s5/utils/mkgraph.sh index 31e86cd38f6..8346c69ffb7 100755 --- a/egs/wsj/s5/utils/mkgraph.sh +++ b/egs/wsj/s5/utils/mkgraph.sh @@ -15,17 +15,12 @@ set -o pipefail -tscale=1.0 -loopscale=0.1 - remove_oov=false for x in `seq 4`; do [ "$1" == "--mono" -o "$1" == "--left-biphone" -o "$1" == "--quinphone" ] && shift && \ echo "WARNING: the --mono, --left-biphone and --quinphone options are now deprecated and ignored." [ "$1" == "--remove-oov" ] && remove_oov=true && shift; - [ "$1" == "--transition-scale" ] && tscale=$2 && shift 2; - [ "$1" == "--self-loop-scale" ] && loopscale=$2 && shift 2; done if [ $# != 3 ]; then @@ -34,8 +29,6 @@ if [ $# != 3 ]; then echo " Options:" echo " --remove-oov # If true, any paths containing the OOV symbol (obtained from oov.int" echo " # in the lang directory) are removed from the G.fst during compilation." - echo " --transition-scale # Scaling factor on transition probabilities." - echo " --self-loop-scale # Please see: http://kaldi-asr.org/doc/hmm.html#hmm_scale." echo "Note: the --mono, --left-biphone and --quinphone options are now deprecated" echo "and will be ignored." exit 1; @@ -75,8 +68,6 @@ fi N=$(tree-info $tree | grep "context-width" | cut -d' ' -f2) || { echo "Error when getting context-width"; exit 1; } P=$(tree-info $tree | grep "central-position" | cut -d' ' -f2) || { echo "Error when getting central-position"; exit 1; } -[[ -f $2/frame_subsampling_factor && "$loopscale" == "0.1" ]] && \ - echo "$0: WARNING: chain models need '--self-loop-scale 1.0'"; if [ -f $lang/phones/nonterm_phones_offset.int ]; then if [[ $N != 2 || $P != 1 ]]; then @@ -124,7 +115,7 @@ trap "rm -f $dir/Ha.fst.$$" EXIT HUP INT PIPE TERM if [[ ! -s $dir/Ha.fst || $dir/Ha.fst -ot $model \ || $dir/Ha.fst -ot $lang/tmp/ilabels_${N}_${P} ]]; then make-h-transducer $nonterm_opt --disambig-syms-out=$dir/disambig_tid.int \ - --transition-scale=$tscale $lang/tmp/ilabels_${N}_${P} $tree $model \ + $lang/tmp/ilabels_${N}_${P} $tree $model \ > $dir/Ha.fst.$$ || exit 1; mv $dir/Ha.fst.$$ $dir/Ha.fst fi @@ -146,14 +137,11 @@ fi trap "rm -f $dir/HCLG.fst.$$" EXIT HUP INT PIPE TERM if [[ ! 
-s $dir/HCLG.fst || $dir/HCLG.fst -ot $dir/HCLGa.fst ]]; then - add-self-loops --self-loop-scale=$loopscale --reorder=true $model $dir/HCLGa.fst | \ + add-self-loops $model $dir/HCLGa.fst | \ $prepare_grammar_command | \ fstconvert --fst_type=const > $dir/HCLG.fst.$$ || exit 1; mv $dir/HCLG.fst.$$ $dir/HCLG.fst - if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then - # No point doing this test if transition-scale not 1, as it is bound to fail. - fstisstochastic $dir/HCLG.fst || echo "[info]: final HCLG is not stochastic." - fi + fstisstochastic $dir/HCLG.fst || echo "[info]: final HCLG is not stochastic." fi # note: the empty FST has 66 bytes. this check is for whether the final FST diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl index 8dba2a0ca69..88230d428e1 100755 --- a/egs/wsj/s5/utils/validate_lang.pl +++ b/egs/wsj/s5/utils/validate_lang.pl @@ -627,19 +627,20 @@ sub check_summation { %phones_in_topo_int_hash = ( ); %phones_in_topo_hash = ( ); while () { - chomp; - next if (m/^<.*>[ ]*$/); - foreach $i (split(" ", $_)) { - if (defined $phones_in_topo_int_hash{$i}) { - $topo_ok = 0; - $exit = 1; print "--> ERROR: $lang/topo has phone $i twice\n"; - } - if (!defined $pint2sym{$i}) { - $topo_ok = 0; - $exit = 1; print "--> ERROR: $lang/topo has phone $i which is not in phones.txt\n"; + if (m//) { + my $line = ; + foreach $phone (split(" ", $line)) { + if (defined $phones_in_topo_int_hash{$phone}) { + $topo_ok = 0; + $exit = 1; print "--> ERROR: $lang/topo has phone $phone twice\n"; + } + if (!defined $pint2sym{$phone}) { + $topo_ok = 0; + $exit = 1; print "--> ERROR: $lang/topo has phone $phone which is not in phones.txt\n"; + } + $phones_in_topo_int_hash{$phone} = 1; + $phones_in_topo_hash{$pint2sym{$phone}} = 1; } - $phones_in_topo_int_hash{$i} = 1; - $phones_in_topo_hash{$pint2sym{$i}} = 1; } } close(T); @@ -816,8 +817,8 @@ sub check_summation { # Check validity of L.fst, L_disambig.fst, and word_boundary.int. # First we generate a random word/subword sequence. We then compile it into fst and compose it with L.fst/L_disambig.fst. -# For subword case the last subword of the sequence must be a end-subword -# (i.e. the subword can only be at the end of word or is a single word itself) +# For subword case the last subword of the sequence must be a end-subword +# (i.e. the subword can only be at the end of word or is a single word itself) # to guarantee the composition would not fail. # We then get the corresponging phones sequence and apply a transition matrix on it to get the number of valid boundaries. # In word case, the number of valid boundaries should be equal to the number of words. 
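For the topo-reading loop changed earlier in this validate_lang.pl hunk, here is a rough Python rendering of the check it performs, under the assumption (based on the FSA-style topo files generated above) that the phone integers sit on the line following the ForPhones marker:

def check_topo_phones(topo_lines, pint2sym):
    # Each phone id may be listed under a ForPhones block at most once, and every
    # id must exist in phones.txt (represented here by the pint2sym mapping).
    seen = set()
    errors = []
    lines = iter(topo_lines)
    for line in lines:
        if "<ForPhones>" in line:
            for phone in next(lines, "").split():
                if phone in seen:
                    errors.append("topo has phone {0} twice".format(phone))
                if phone not in pint2sym:
                    errors.append("topo has phone {0} which is not in phones.txt"
                                  .format(phone))
                seen.add(phone)
    return errors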
@@ -883,14 +884,14 @@ sub check_summation { $end_subword ++; } } - } + } # generate the last word (subword) $id = int(rand(scalar(keys %wint2sym))); if ($subword_check) { $subword = $wint2sym{$id}; $suffix = substr($subword, -$separator_length, $separator_length); - # the last subword can not followed by separator + # the last subword can not followed by separator while (defined $wdisambig_words_hash{$id} or $wint2sym{$id} eq "" or $wint2sym{$id} eq "" or $wint2sym{$id} =~ m/^#nonterm/ or $id == 0 or $suffix eq $separator) { @@ -952,7 +953,7 @@ sub check_summation { } } if (!$exit) { - if ($subword_check) { + if ($subword_check) { $wlen = $end_subword; } if ($num_words != $wlen) { diff --git a/egs/yomdle_fa/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/yomdle_fa/v1/local/chain/run_cnn_e2eali_1b.sh index 700b57d9fce..90fae4d4015 100755 --- a/egs/yomdle_fa/v1/local/chain/run_cnn_e2eali_1b.sh +++ b/egs/yomdle_fa/v1/local/chain/run_cnn_e2eali_1b.sh @@ -102,7 +102,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} $data_dir/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts @@ -227,7 +227,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $data_dir/$lang_test \ + $data_dir/$lang_test \ $dir $dir/graph || exit 1; fi diff --git a/egs/yomdle_fa/v1/local/chain/run_flatstart_cnn1a.sh b/egs/yomdle_fa/v1/local/chain/run_flatstart_cnn1a.sh index bb5352943f6..6adde439b00 100755 --- a/egs/yomdle_fa/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/yomdle_fa/v1/local/chain/run_flatstart_cnn1a.sh @@ -155,7 +155,7 @@ if [ $stage -le 4 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $data_dir/$lang_test \ + $data_dir/$lang_test \ $dir $dir/graph || exit 1; fi diff --git a/egs/yomdle_fa/v1/run.sh b/egs/yomdle_fa/v1/run.sh index a7547b1ee69..da75679a8b5 100755 --- a/egs/yomdle_fa/v1/run.sh +++ b/egs/yomdle_fa/v1/run.sh @@ -99,7 +99,7 @@ if [ $stage -le 6 ]; then echo "$0: Aligning the training data using the e2e chain model..." echo "Date: $(date)." steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \ + \ $data_dir/train_aug $data_dir/lang $exp_dir/chain/e2e_cnn_1a $exp_dir/chain/e2e_ali_train fi diff --git a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index 03333f6d229..ad00b8d4774 100755 --- a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -98,7 +98,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi @@ -216,7 +216,7 @@ if [ $stage -le 6 ] && $decode_chain; then # as long as phones.txt was compatible. 
utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh index fd9cdc8921d..3e9197e7e42 100755 --- a/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/yomdle_korean/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -95,7 +95,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi diff --git a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh index f6b2c1bac42..5fa8d3a0d29 100755 --- a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh +++ b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh @@ -100,7 +100,7 @@ for f in data/$supervised_set/feats.scp \ done if [ ! -f $graphdir/HCLG.fst ]; then - utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $sup_chain_dir $graphdir + utils/mkgraph.sh $lang_decode $sup_chain_dir $graphdir fi # Decode unsupervised data and write lattices in non-compact @@ -312,7 +312,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $dir $dir/graph + utils/mkgraph.sh $lang_decode $dir $dir/graph fi if [ $stage -le 18 ]; then diff --git a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh index 8185fa2645d..cef080071b1 100755 --- a/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh +++ b/egs/yomdle_korean/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh @@ -99,7 +99,7 @@ for f in data/$supervised_set/feats.scp \ done if [ ! -f $graphdir/HCLG.fst ]; then - utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $sup_chain_dir $graphdir + utils/mkgraph.sh $lang_decode $sup_chain_dir $graphdir fi # Decode unsupervised data and write lattices in non-compact @@ -310,7 +310,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $dir $dir/graph + utils/mkgraph.sh $lang_decode $dir $dir/graph fi if [ $stage -le 18 ]; then diff --git a/egs/yomdle_korean/v1/run_end2end.sh b/egs/yomdle_korean/v1/run_end2end.sh index 65f5beb4b08..193e6eebff3 100755 --- a/egs/yomdle_korean/v1/run_end2end.sh +++ b/egs/yomdle_korean/v1/run_end2end.sh @@ -127,7 +127,7 @@ fi if [ $stage -le 7 ]; then echo "$(date) stage 7: Aligning the training data using the e2e chain model..." steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \ + \ data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi @@ -152,7 +152,7 @@ if [ $stage -le 10 ] && $decode_e2e; then echo "$(date) stage 10: decoding end2end setup..." 
utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ @@ -170,7 +170,7 @@ if [ $stage -le 11 ] && $decode_chain; then echo "$(date) stage 11: decoding chain alignment setup..." utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ exp/chain/cnn_e2eali_1a/ exp/chain/cnn_e2eali_1a/graph || exit 1; frames_per_chunk=$(echo $chunk_width | cut -d, -f1) diff --git a/egs/yomdle_russian/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/yomdle_russian/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index cd582472993..969f50dc857 100755 --- a/egs/yomdle_russian/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/yomdle_russian/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -90,7 +90,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi diff --git a/egs/yomdle_russian/v1/run_end2end.sh b/egs/yomdle_russian/v1/run_end2end.sh index 12beebeaa05..03525a22d54 100755 --- a/egs/yomdle_russian/v1/run_end2end.sh +++ b/egs/yomdle_russian/v1/run_end2end.sh @@ -127,7 +127,7 @@ fi if [ $stage -le 7 ]; then echo "$0: $(date) stage 7: Aligning the training data using the e2e chain model..." steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \ + \ data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi @@ -152,7 +152,7 @@ if [ $stage -le 10 ] && $decode_e2e; then echo "$0: $(date) stage 10: decoding end2end setup..." utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ @@ -170,7 +170,7 @@ if [ $stage -le 11 ] && $decode_chain; then echo "$0: $(date) stage 11: decoding chain alignment setup..." utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ exp/chain/cnn_e2eali_1a/ exp/chain/cnn_e2eali_1a/graph || exit 1; frames_per_chunk=$(echo $chunk_width | cut -d, -f1) diff --git a/egs/yomdle_tamil/v1/local/chain/run_e2e_cnn.sh b/egs/yomdle_tamil/v1/local/chain/run_e2e_cnn.sh index f553467d4a6..7145dd365a4 100755 --- a/egs/yomdle_tamil/v1/local/chain/run_e2e_cnn.sh +++ b/egs/yomdle_tamil/v1/local/chain/run_e2e_cnn.sh @@ -141,7 +141,7 @@ if [ $stage -le 4 ] && $decode_e2e; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index 03333f6d229..ad00b8d4774 100755 --- a/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -98,7 +98,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi @@ -216,7 +216,7 @@ if [ $stage -le 6 ] && $decode_chain; then # as long as phones.txt was compatible. 
utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1b.sh index fb15ce10dde..a531d966dad 100755 --- a/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/yomdle_tamil/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -96,7 +96,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts fi @@ -216,7 +216,7 @@ if [ $stage -le 6 ] && $decode_chain; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ + $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh b/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh index f6b2c1bac42..5fa8d3a0d29 100755 --- a/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh +++ b/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1a.sh @@ -100,7 +100,7 @@ for f in data/$supervised_set/feats.scp \ done if [ ! -f $graphdir/HCLG.fst ]; then - utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $sup_chain_dir $graphdir + utils/mkgraph.sh $lang_decode $sup_chain_dir $graphdir fi # Decode unsupervised data and write lattices in non-compact @@ -312,7 +312,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $dir $dir/graph + utils/mkgraph.sh $lang_decode $dir $dir/graph fi if [ $stage -le 18 ]; then diff --git a/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh b/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh index 17d59642b05..dae34d51f20 100755 --- a/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh +++ b/egs/yomdle_tamil/v1/local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh @@ -99,7 +99,7 @@ for f in data/$supervised_set/feats.scp \ done if [ ! -f $graphdir/HCLG.fst ]; then - utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $sup_chain_dir $graphdir + utils/mkgraph.sh $lang_decode $sup_chain_dir $graphdir fi # Decode unsupervised data and write lattices in non-compact @@ -308,7 +308,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode $dir $dir/graph + utils/mkgraph.sh $lang_decode $dir $dir/graph fi if [ $stage -le 18 ]; then diff --git a/egs/yomdle_tamil/v1/run_end2end.sh b/egs/yomdle_tamil/v1/run_end2end.sh index e6a8e0a4432..55a4d7bc83d 100755 --- a/egs/yomdle_tamil/v1/run_end2end.sh +++ b/egs/yomdle_tamil/v1/run_end2end.sh @@ -155,7 +155,7 @@ if [ $stage -le 8 ]; then echo "$(date) stage 8: Aligning the training data using the e2e chain model..." 
steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ --use-gpu false \ - --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \ + \ data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi diff --git a/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh index 357ce6a1f8e..d12366f7923 100755 --- a/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh +++ b/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh @@ -89,7 +89,7 @@ if [ $stage -le 2 ]; then # use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + \ ${train_data_dir} $data_dir/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts @@ -215,7 +215,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $data_dir/$lang_test \ + $data_dir/$lang_test \ $dir $dir/graph || exit 1; fi diff --git a/egs/yomdle_zh/v1/local/chain/run_flatstart_cnn1a.sh b/egs/yomdle_zh/v1/local/chain/run_flatstart_cnn1a.sh index 28ea2863e38..9ed5df36729 100755 --- a/egs/yomdle_zh/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/yomdle_zh/v1/local/chain/run_flatstart_cnn1a.sh @@ -156,7 +156,7 @@ if [ $stage -le 4 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 $data_dir/$lang_test \ + $data_dir/$lang_test \ $dir $dir/graph || exit 1; fi diff --git a/egs/yomdle_zh/v1/run.sh b/egs/yomdle_zh/v1/run.sh index eb8e9e11927..75f11b13dcb 100755 --- a/egs/yomdle_zh/v1/run.sh +++ b/egs/yomdle_zh/v1/run.sh @@ -102,7 +102,7 @@ if [ $stage -le 6 ]; then echo "$0: Aligning the training data using the e2e chain model..." echo "Date: $(date)." 
steps/nnet3/align.sh --nj $nj --cmd "$cmd" --use-gpu false \ - --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \ + \ $data_dir/train_aug $data_dir/lang $exp_dir/chain/e2e_cnn_1a $exp_dir/chain/e2e_ali_train fi diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh index 14b9a8d6c8e..02706d98602 100755 --- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_1a.sh @@ -252,7 +252,7 @@ if [ $stage -le 13 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgsmall/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgsmall \ + data/lang_test_tgsmall \ $tree_dir $tree_dir/graph_tgsmall || exit 1; fi diff --git a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh index 28b36243ba3..5372a5862fa 100755 --- a/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/zeroth_korean/s5/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -260,7 +260,7 @@ if [ $stage -le 13 ]; then utils/lang/check_phones_compatible.sh \ data/lang_test_tgsmall/phones.txt $lang/phones.txt utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgsmall \ + data/lang_test_tgsmall \ $tree_dir $tree_dir/graph_tgsmall || exit 1; fi diff --git a/kaldi b/kaldi new file mode 120000 index 00000000000..e8310385c56 --- /dev/null +++ b/kaldi @@ -0,0 +1 @@ +src \ No newline at end of file diff --git a/src/Makefile b/src/Makefile index 07b7947f3b1..a287fc3d25a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -5,16 +5,16 @@ SHELL := /bin/bash SUBDIRS = base matrix util feat cudafeat tree gmm transform \ - fstext hmm lm decoder lat kws cudamatrix nnet \ + fstext hmm lm decoder lat kws cudamatrix \ bin fstbin gmmbin fgmmbin featbin cudafeatbin \ - nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 rnnlm chain nnet3bin nnet2bin kwsbin \ + latbin nnet3 rnnlm chain nnet3bin kwsbin \ ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin \ - cudadecoder cudadecoderbin + cudadecoder cudadecoderbin cblasext MEMTESTDIRS = base matrix util feat cudafeat tree gmm transform \ - fstext hmm lm decoder lat nnet kws chain \ + fstext hmm lm decoder lat kws chain \ bin fstbin gmmbin fgmmbin featbin cudafeatbin \ - nnetbin latbin sgmm2 nnet2 nnet3 rnnlm nnet2bin nnet3bin sgmm2bin kwsbin \ + latbin nnet3 rnnlm nnet3bin kwsbin \ ivector ivectorbin online2 online2bin lmbin CUDAMEMTESTDIR = cudamatrix @@ -23,9 +23,6 @@ SUBDIRS_LIB = $(filter-out %bin, $(SUBDIRS)) KALDI_SONAME ?= libkaldi.so -# Optional subdirectories -EXT_SUBDIRS = online onlinebin # python-kaldi-decoding -EXT_SUBDIRS_LIB = $(filter-out %bin, $(EXT_SUBDIRS)) include kaldi.mk @@ -72,19 +69,6 @@ endif endif endif -biglibext: $(EXT_SUBDIRS_LIB) -ifeq ($(KALDI_FLAVOR), dynamic) -ifeq ($(shell uname), Darwin) - $(CXX) -dynamiclib -o $(KALDILIBDIR)/libkaldi_ext.dylib -install_name @rpath/libkaldi_ext.dylib -framework Accelerate $(LDFLAGS) $(EXT_SUBDIRS_LIB:=/*.dylib) -else -ifeq ($(shell uname), Linux) - #$(warning The following command will probably fail, in that case add -fPIC to your CXXFLAGS and remake all.) - $(CXX) -shared -o $(KALDILIBDIR)/libkaldi_ext.so -Wl,-soname=libkaldi_ext.so,--whole-archive $(EXT_SUBDIRS_LIB:=/kaldi-*.a) -Wl,--no-whole-archive -else - $(error Dynamic libraries not supported on this platform. Run configure with --static flag. 
) -endif -endif -endif kaldi.mk: @[ -f kaldi.mk ] || { echo "kaldi.mk does not exist; you have to run ./configure"; exit 1; } @@ -143,39 +127,37 @@ $(EXT_SUBDIRS) : checkversion kaldi.mk mklibdir ext_depend ### Dependency list ### # this is necessary for correct parallel compilation #1)The tools depend on all the libraries -bin fstbin gmmbin fgmmbin sgmm2bin featbin cudafeatbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin cudadecoderbin: \ - base matrix util feat cudafeat tree gmm transform sgmm2 fstext hmm \ - lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2 rnnlm \ - cudadecoder + +bin fstbin gmmbin fgmmbin featbin cudafeatbin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin cudadecoderbin: \ + base matrix util feat cudafeat tree gmm transform fstext hmm \ + lm decoder lat cudadecoder cudamatrix nnet3 ivector chain kws online2 rnnlm #2)The libraries have inter-dependencies base: base/.depend.mk -matrix: base -util: base matrix -feat: base matrix util gmm transform tree -tree: base util matrix -gmm: base util matrix tree -transform: base util matrix gmm tree -sgmm2: base util matrix gmm tree transform hmm -fstext: base util matrix tree -hmm: base tree matrix util -lm: base util matrix fstext -decoder: base util matrix gmm hmm tree transform lat fstext -lat: base util hmm tree matrix -cudamatrix: base util matrix -nnet: base util hmm tree matrix cudamatrix -nnet2: base util matrix lat gmm hmm tree transform cudamatrix -nnet3: base util matrix decoder lat gmm hmm tree transform cudamatrix chain fstext -rnnlm: base util matrix cudamatrix nnet3 lm hmm -chain: lat hmm tree fstext matrix cudamatrix util base -ivector: base util matrix transform tree gmm -#3)Dependencies for optional parts of Kaldi -onlinebin: base matrix util feat tree gmm transform sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online -# python-kaldi-decoding: base matrix util feat tree gmm transform sgmm2 fstext hmm decoder lat online -cudafeat: base matrix util gmm transform tree feat cudamatrix online2 -cudafeatbin: base matrix util gmm transform tree feat cudamatrix cudafeat online2 -online: decoder gmm transform feat matrix util base lat hmm tree -online2: decoder gmm transform feat matrix util base lat hmm tree ivector cudamatrix nnet2 nnet3 chain -kws: base util hmm tree matrix lat +cblasext: base +matrix: base cblasext +util: base matrix cblasext +feat: base cudamatrix matrix cblasext util gmm transform tree +tree: base util matrix cblasext +gmm: base util matrix cblasext tree +transform: base util matrix cblasext gmm tree +fstext: base util matrix cblasext tree +hmm: base tree matrix cblasext util +lm: base util matrix cblasext fstext +decoder: base util matrix cblasext gmm hmm tree transform lat fstext +lat: base util hmm tree matrix cblasext +cudamatrix: base util matrix cblasext +nnet3: base util matrix cblasext decoder lat gmm hmm tree transform cudamatrix chain fstext +rnnlm: base util matrix cblasext cudamatrix nnet3 lm hmm +chain: lat hmm tree fstext matrix cblasext cudamatrix util base +ivector: base util matrix cblasext transform tree gmm +cudafeat: base cudamatrix matrix cblasext util gmm transform tree feat cudamatrix online2 +onlinebin: base cudamatrix matrix cblasext util feat tree gmm transform fstext hmm lm decoder lat cudamatrix online +# python-kaldi-decoding: base cudamatrix matrix cblasext util feat tree gmm transform fstext hmm decoder lat online + +cudafeatbin: base cudamatrix matrix cblasext util gmm transform 
tree feat cudamatrix cudafeat online2 +online: decoder gmm transform feat matrix cblasext util base lat hmm tree +online2: decoder gmm transform feat matrix cblasext util base lat hmm tree ivector cudamatrix nnet3 chain +kws: base util hmm tree matrix cblasext lat cudadecoder: cudamatrix cudafeat online2 nnet3 ivector feat fstext lat chain transform -cudadecoderbin: cudadecoder cudafeat cudamatrix online2 nnet3 ivector feat fstext lat chain transform + diff --git a/src/bin/Makefile b/src/bin/Makefile index 7cb01b50120..855a43bf350 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -1,6 +1,5 @@ all: - -rm -f arpa2fst EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk @@ -22,7 +21,7 @@ BINFILES = align-equal align-equal-compiled acc-tree-stats \ matrix-sum build-pfile-from-ali get-post-on-ali tree-info am-info \ vector-sum matrix-sum-rows est-pca sum-lda-accs sum-mllt-accs \ transform-vec align-text matrix-dim post-to-smat compile-graph \ - compare-int-vector + compare-int-vector cuda-gpu-available OBJFILES = @@ -30,9 +29,9 @@ OBJFILES = ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../lm/kaldi-lm.a \ ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a - + ../tree/kaldi-tree.a ../cudamatrix/kaldi-cudamatrix.a \ + ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a TESTFILES = diff --git a/src/bin/acc-lda.cc b/src/bin/acc-lda.cc index b664135bdc7..a0451218513 100644 --- a/src/bin/acc-lda.cc +++ b/src/bin/acc-lda.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/posterior.h" #include "transform/lda-estimate.h" @@ -57,7 +57,7 @@ int main(int argc, char *argv[]) { std::string posteriors_rspecifier = po.GetArg(3); std::string acc_wxfilename = po.GetArg(4); - TransitionModel trans_model; + Transitions trans_model; { bool binary_read; Input ki(model_rxfilename, &binary_read); diff --git a/src/bin/acc-tree-stats.cc b/src/bin/acc-tree-stats.cc index 8b9ce9065b4..c0eb31f6064 100644 --- a/src/bin/acc-tree-stats.cc +++ b/src/bin/acc-tree-stats.cc @@ -22,7 +22,7 @@ #include "util/common-utils.h" #include "tree/context-dep.h" #include "tree/build-tree-utils.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/tree-accu.h" /** @brief Accumulate tree statistics for decision tree training. The @@ -62,7 +62,7 @@ int main(int argc, char *argv[]) { AccumulateTreeStatsInfo acc_tree_stats_info(opts); - TransitionModel trans_model; + Transitions trans_model; { bool binary; Input ki(model_filename, &binary); diff --git a/src/bin/add-self-loops.cc b/src/bin/add-self-loops.cc index b223dfe317d..ebaf219aff1 100644 --- a/src/bin/add-self-loops.cc +++ b/src/bin/add-self-loops.cc @@ -18,7 +18,7 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. 
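The bin/ hunks above (acc-lda.cc, acc-tree-stats.cc, and the files that follow) replace the old TransitionModel class and its hmm/transition-model.h header with the new Transitions class in hmm/transitions.h. A minimal sketch of the updated read pattern those tools now share; the standalone main() and the "final.mdl" path are illustrative, not part of the patch:

    #include "hmm/transitions.h"
    #include "util/common-utils.h"

    int main() {
      using namespace kaldi;
      Transitions trans_model;                     // was: TransitionModel trans_model;
      ReadKaldiObject("final.mdl", &trans_model);  // same I/O helper as before
      KALDI_LOG << "transition-ids: " << trans_model.NumTransitionIds()
                << ", pdfs: " << trans_model.NumPdfs();
      return 0;
    }

Note that the old notion of a "transition-state" is gone, which is why am-info and hmm-info below stop printing a transition-state count.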
-#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "tree/context-dep.h" #include "util/common-utils.h" @@ -46,20 +46,14 @@ int main(int argc, char *argv[]) { "is recommended as the decoding will in that case be faster.\n" "Usage: add-self-loops [options] transition-gmm/acoustic-model [fst-in] [fst-out]\n" "e.g.: \n" - " add-self-loops --self-loop-scale=0.1 1.mdl HCLGa.fst HCLG.fst\n" - "or: add-self-loops --self-loop-scale=0.1 1.mdl HCLG.fst\n"; + " add-self-loops1.mdl HCLGa.fst HCLG.fst\n" + "or: add-self-loops 1.mdl HCLG.fst\n"; - BaseFloat self_loop_scale = 1.0; - bool reorder = true; std::string disambig_in_filename; ParseOptions po(usage); - po.Register("self-loop-scale", &self_loop_scale, - "Scale for self-loop probabilities relative to LM."); po.Register("disambig-syms", &disambig_in_filename, "List of disambiguation symbols on input of fst-in [input file]"); - po.Register("reorder", &reorder, - "If true, reorder symbols for more decoding efficiency"); po.Read(argc, argv); if (po.NumArgs() < 1 || po.NumArgs() > 3) { @@ -88,7 +82,7 @@ int main(int argc, char *argv[]) { "standard input" : disambig_in_filename); } - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_in_filename, &trans_model); @@ -97,13 +91,14 @@ int main(int argc, char *argv[]) { if (!fst) KALDI_ERR << "add-self-loops: error reading input FST."; - bool check_no_self_loops = true; + BaseFloat self_loop_scale = 1.0; + bool currently_self_loop_free = true; // The work gets done here. AddSelfLoops(trans_model, disambig_syms_in, self_loop_scale, - reorder, check_no_self_loops, fst); + currently_self_loop_free, fst); if (! fst->Write(fst_out_filename) ) KALDI_ERR << "add-self-loops: error writing FST to " diff --git a/src/bin/ali-to-pdf.cc b/src/bin/ali-to-pdf.cc index 61b5138cf31..3c978ca62f0 100644 --- a/src/bin/ali-to-pdf.cc +++ b/src/bin/ali-to-pdf.cc @@ -21,7 +21,7 @@ */ #include "base/kaldi-common.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "util/common-utils.h" #include "fst/fstlib.h" @@ -48,7 +48,7 @@ int main(int argc, char *argv[]) { alignments_rspecifier = po.GetArg(2), pdfs_wspecifier = po.GetArg(3); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_filename, &trans_model); SequentialInt32VectorReader reader(alignments_rspecifier); @@ -60,7 +60,7 @@ int main(int argc, char *argv[]) { std::vector alignment = reader.Value(); for (size_t i = 0; i < alignment.size(); i++) - alignment[i] = trans_model.TransitionIdToPdf(alignment[i]); + alignment[i] = trans_model.TransitionIdToPdfFast(alignment[i]); writer.Write(key, alignment); num_done++; diff --git a/src/bin/ali-to-phones.cc b/src/bin/ali-to-phones.cc index 602e32e9768..ed7f99758cd 100644 --- a/src/bin/ali-to-phones.cc +++ b/src/bin/ali-to-phones.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "util/common-utils.h" #include "fst/fstlib.h" @@ -68,7 +68,7 @@ int main(int argc, char *argv[]) { std::string model_filename = po.GetArg(1), alignments_rspecifier = po.GetArg(2); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_filename, &trans_model); SequentialInt32VectorReader reader(alignments_rspecifier); @@ -98,7 +98,7 @@ int main(int argc, char *argv[]) { BaseFloat phone_start = 0.0; for (size_t i = 0; i < split.size(); i++) { 
KALDI_ASSERT(!split[i].empty()); - int32 phone = trans_model.TransitionIdToPhone(split[i][0]); + int32 phone = trans_model.InfoForTransitionId(split[i][0]).phone; int32 num_repeats = split[i].size(); ctm_writer.Stream() << key << " 1 " << phone_start << " " << (frame_shift * num_repeats) << " " << phone << std::endl; @@ -108,7 +108,7 @@ int main(int argc, char *argv[]) { std::vector phones; for (size_t i = 0; i < split.size(); i++) { KALDI_ASSERT(!split[i].empty()); - int32 phone = trans_model.TransitionIdToPhone(split[i][0]); + int32 phone = trans_model.InfoForTransitionId(split[i][0]).phone; int32 num_repeats = split[i].size(); //KALDI_ASSERT(num_repeats!=0); if (per_frame) @@ -122,7 +122,7 @@ int main(int argc, char *argv[]) { std::vector > pairs; for (size_t i = 0; i < split.size(); i++) { KALDI_ASSERT(split[i].size() > 0); - int32 phone = trans_model.TransitionIdToPhone(split[i][0]); + int32 phone = trans_model.InfoForTransitionId(split[i][0]).phone; int32 num_repeats = split[i].size(); //KALDI_ASSERT(num_repeats!=0); pairs.push_back(std::make_pair(phone, num_repeats)); diff --git a/src/bin/ali-to-post.cc b/src/bin/ali-to-post.cc index ac87d676c06..00c026c0692 100644 --- a/src/bin/ali-to-post.cc +++ b/src/bin/ali-to-post.cc @@ -22,7 +22,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "hmm/posterior.h" diff --git a/src/bin/align-compiled-mapped.cc b/src/bin/align-compiled-mapped.cc index 98ffebd6eaa..a47231f7b5a 100644 --- a/src/bin/align-compiled-mapped.cc +++ b/src/bin/align-compiled-mapped.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "fstext/fstext-lib.h" #include "decoder/decoder-wrappers.h" @@ -50,17 +50,11 @@ int main(int argc, char *argv[]) { AlignConfig align_config; bool binary = true; BaseFloat acoustic_scale = 1.0; - BaseFloat transition_scale = 1.0; - BaseFloat self_loop_scale = 1.0; align_config.Register(&po); po.Register("binary", &binary, "Write output in binary mode"); - po.Register("transition-scale", &transition_scale, - "Transition-probability scale [relative to acoustics]"); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); - po.Register("self-loop-scale", &self_loop_scale, - "Scale of self-loop versus non-self-loop log probs [relative to acoustics]"); po.Read(argc, argv); if (po.NumArgs() < 4 || po.NumArgs() > 5) { @@ -74,7 +68,7 @@ int main(int argc, char *argv[]) { std::string alignment_wspecifier = po.GetArg(4); std::string scores_wspecifier = po.GetOptArg(5); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_in_filename, &trans_model); SequentialBaseFloatMatrixReader loglikes_reader(feature_rspecifier); @@ -110,13 +104,6 @@ int main(int argc, char *argv[]) { continue; } - { // Add transition-probs to the FST. - std::vector disambig_syms; // empty. 
- AddTransitionProbs(trans_model, disambig_syms, - transition_scale, self_loop_scale, - &decode_fst); - } - DecodableMatrixScaledMapped decodable(trans_model, loglikes, acoustic_scale); AlignUtteranceWrapper(align_config, utt, diff --git a/src/bin/align-equal-compiled.cc b/src/bin/align-equal-compiled.cc index c4ab9d4205a..f5900727aef 100644 --- a/src/bin/align-equal-compiled.cc +++ b/src/bin/align-equal-compiled.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/training-graph-compiler.h" diff --git a/src/bin/align-equal.cc b/src/bin/align-equal.cc index a3bc40dc236..80caff00168 100644 --- a/src/bin/align-equal.cc +++ b/src/bin/align-equal.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/training-graph-compiler.h" @@ -65,13 +65,13 @@ int main(int argc, char *argv[]) { ContextDependency ctx_dep; ReadKaldiObject(tree_in_filename, &ctx_dep); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_in_filename, &trans_model); // need VectorFst because we will change it by adding subseq symbol. VectorFst *lex_fst = fst::ReadFstKaldi(lex_in_filename); - TrainingGraphCompilerOptions gc_opts(1.0, true); // true -> Dan style graph. + TrainingGraphCompilerOptions gc_opts; std::vector disambig_syms; if (disambig_rxfilename != "") diff --git a/src/bin/align-mapped.cc b/src/bin/align-mapped.cc index c78401fffdd..e8249c4a123 100644 --- a/src/bin/align-mapped.cc +++ b/src/bin/align-mapped.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/decoder-wrappers.h" #include "decoder/training-graph-compiler.h" @@ -72,7 +72,7 @@ int main(int argc, char *argv[]) { ContextDependency ctx_dep; ReadKaldiObject(tree_in_filename, &ctx_dep); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_in_filename, &trans_model); VectorFst *lex_fst = fst::ReadFstKaldi(lex_in_filename); diff --git a/src/bin/am-info.cc b/src/bin/am-info.cc index 6afb0c5014e..f2516c436f8 100644 --- a/src/bin/am-info.cc +++ b/src/bin/am-info.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" int main(int argc, char *argv[]) { try { @@ -45,7 +45,7 @@ int main(int argc, char *argv[]) { std::string model_in_filename = po.GetArg(1); - TransitionModel trans_model; + Transitions trans_model; { bool binary_read; Input ki(model_in_filename, &binary_read); @@ -56,8 +56,6 @@ int main(int argc, char *argv[]) { std::cout << "number of pdfs " << trans_model.NumPdfs() << '\n'; std::cout << "number of transition-ids " << trans_model.NumTransitionIds() << '\n'; - std::cout << "number of transition-states " - << trans_model.NumTransitionStates() << '\n'; } catch(const std::exception &e) { std::cerr << e.what() << '\n'; return -1; diff --git a/src/bin/build-pfile-from-ali.cc b/src/bin/build-pfile-from-ali.cc index fadb873825f..e1967c77d8c 100644 --- a/src/bin/build-pfile-from-ali.cc +++ b/src/bin/build-pfile-from-ali.cc @@ -25,7 +25,7 @@ using std::vector; #include "base/kaldi-common.h" #include 
"gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "util/common-utils.h" @@ -64,7 +64,7 @@ int main(int argc, char *argv[]) { feature_rspecifier = po.GetArg(3), pfile_wspecifier = po.GetArg(4); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; @@ -115,7 +115,7 @@ int main(int argc, char *argv[]) { } // Output the class label ss << " "; - ss << trans_model.TransitionIdToPdf(alignment[i]); + ss << trans_model.TransitionIdToPdfFast(alignment[i]); ko.Stream() << ss.str().c_str(); ko.Stream() << "\n"; diff --git a/src/bin/build-tree-two-level.cc b/src/bin/build-tree-two-level.cc index c7cd553484e..005c5d80532 100644 --- a/src/bin/build-tree-two-level.cc +++ b/src/bin/build-tree-two-level.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/hmm-topology.h" +#include "hmm/topology.h" #include "tree/context-dep.h" #include "tree/build-tree.h" #include "tree/build-tree-utils.h" @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) { ReadRootsFile(ki.Stream(), &phone_sets, &is_shared_root, &is_split_root); } - HmmTopology topo; + Topology topo; ReadKaldiObject(topo_filename, &topo); BuildTreeStatsType stats; diff --git a/src/bin/build-tree.cc b/src/bin/build-tree.cc index 72774900d61..b37c9c7d184 100644 --- a/src/bin/build-tree.cc +++ b/src/bin/build-tree.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/hmm-topology.h" +#include "hmm/topology.h" #include "tree/context-dep.h" #include "tree/build-tree.h" #include "tree/build-tree-utils.h" @@ -91,7 +91,7 @@ int main(int argc, char *argv[]) { ReadRootsFile(ki.Stream(), &phone_sets, &is_shared_root, &is_split_root); } - HmmTopology topo; + Topology topo; ReadKaldiObject(topo_filename, &topo); BuildTreeStatsType stats; diff --git a/src/bin/cluster-phones.cc b/src/bin/cluster-phones.cc index 1d5b3824252..24627ca3bfc 100644 --- a/src/bin/cluster-phones.cc +++ b/src/bin/cluster-phones.cc @@ -49,7 +49,7 @@ int main(int argc, char *argv[]) { // bool binary = true; int32 P = 1, N = 3; // Note: N does not matter. - std::string pdf_class_list_str = "1"; // 1 is just the central position of 3. + std::string pdf_class_list_str = "2"; // 2 is just the central position of 3. 
std::string mode = "questions"; int32 num_classes = -1; @@ -57,7 +57,7 @@ int main(int argc, char *argv[]) { // po.Register("binary", &binary, "Write output in binary mode"); po.Register("central-position", &P, "Central position in context window [must match acc-tree-stats]"); po.Register("context-width", &N, "Does not have any effect-- included for scripting convenience."); - po.Register("pdf-class-list", &pdf_class_list_str, "Colon-separated list of HMM positions to consider [Default = 1: just central position for 3-state models]."); + po.Register("pdf-class-list", &pdf_class_list_str, "Colon-separated list of HMM positions to consider [Default = 2: just central position for 3-state models]."); po.Register("mode", &mode, "Mode of operation: \"questions\"->sets suitable for decision trees; \"k-means\"->k-means algorithm, output k classes (set num-classes options)\n"); po.Register("num-classes", &num_classes, "For k-means mode, number of classes."); @@ -86,7 +86,7 @@ int main(int argc, char *argv[]) { std::vector pdf_class_list; if (!SplitStringToIntegers(pdf_class_list_str, ":", false, &pdf_class_list) || pdf_class_list.empty()) { - KALDI_ERR << "Invalid pdf-class-list string [expecting colon-separated list of integers]: " + KALDI_ERR << "Invalid pdf-class-list string [expecting colon-separated list of integers]: " << pdf_class_list_str; } diff --git a/src/bin/compile-graph.cc b/src/bin/compile-graph.cc index 7174fdf8113..dea332aced0 100644 --- a/src/bin/compile-graph.cc +++ b/src/bin/compile-graph.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "fstext/fstext-lib.h" #include "fstext/push-special.h" @@ -48,19 +48,12 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); - BaseFloat transition_scale = 1.0; - BaseFloat self_loop_scale = 1.0; // Caution: the script default is 0.1. int32 nonterm_phones_offset = -1; std::string disambig_rxfilename; po.Register("read-disambig-syms", &disambig_rxfilename, "File containing " "list of disambiguation symbols in phone symbol table"); - po.Register("transition-scale", &transition_scale, "Scale of transition " - "probabilities (excluding self-loops)."); - po.Register("self-loop-scale", &self_loop_scale, "Scale of self-loop vs. " - "non-self-loop probability mass. Caution: the default of " - "mkgraph.sh is 0.1, but this defaults to 1.0."); po.Register("nonterm-phones-offset", &nonterm_phones_offset, "Integer " "value of symbol #nonterm_bos in phones.txt, if present. " "(Only relevant for grammar decoding)."); @@ -81,7 +74,7 @@ int main(int argc, char *argv[]) { ContextDependency ctx_dep; // the tree. ReadKaldiObject(tree_rxfilename, &ctx_dep); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_rxfilename, &trans_model); VectorFst *lex_fst = fst::ReadFstKaldi(lex_rxfilename), @@ -141,20 +134,18 @@ int main(int argc, char *argv[]) { lg_fst.DeleteStates(); HTransducerConfig h_cfg; - h_cfg.transition_scale = transition_scale; h_cfg.nonterm_phones_offset = nonterm_phones_offset; std::vector disambig_syms_h; // disambiguation symbols on // input side of H. - VectorFst *h_fst = GetHTransducer(ilabels, - ctx_dep, - trans_model, - h_cfg, - &disambig_syms_h); + std::unique_ptr> h_fst = GetHTransducer(ilabels, + ctx_dep, + trans_model, + h_cfg, + &disambig_syms_h); VectorFst hclg_fst; // transition-id to word. 
TableCompose(*h_fst, clg_fst, &hclg_fst); clg_fst.DeleteStates(); - delete h_fst; KALDI_ASSERT(hclg_fst.Start() != fst::kNoStateId); @@ -170,13 +161,12 @@ int main(int argc, char *argv[]) { MinimizeEncoded(&hclg_fst); std::vector disambig; - bool check_no_self_loops = true, - reorder = true; + bool currently_self_loop_free = true, + use_weights = true; AddSelfLoops(trans_model, disambig, - self_loop_scale, - reorder, - check_no_self_loops, + currently_self_loop_free, + use_weights, &hclg_fst); if (nonterm_phones_offset >= 0) diff --git a/src/bin/compile-questions.cc b/src/bin/compile-questions.cc index f9694140ae8..bf734ac01da 100644 --- a/src/bin/compile-questions.cc +++ b/src/bin/compile-questions.cc @@ -19,12 +19,12 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/hmm-topology.h" +#include "hmm/topology.h" #include "tree/build-tree-questions.h" namespace kaldi { -int32 ProcessTopo(const HmmTopology &topo, const std::vector > &questions) { +int32 ProcessTopo(const Topology &topo, const std::vector > &questions) { std::vector seen_phones; // ids of phones seen in questions. for (size_t i = 0; i < questions.size(); i++) for (size_t j= 0; j < questions[i].size(); j++) seen_phones.push_back(questions[i][j]); @@ -93,7 +93,7 @@ int main(int argc, char *argv[]) { questions_rxfilename = po.GetArg(2), questions_out_filename = po.GetArg(3); - HmmTopology topo; // just needed for checking, and to get the + Topology topo; // just needed for checking, and to get the // largest number of pdf-classes for any phone. ReadKaldiObject(topo_filename, &topo); @@ -130,13 +130,13 @@ int main(int argc, char *argv[]) { } QuestionsForKey pdfclass_opts(num_iters_refine); - std::vector > pdfclass_questions(max_num_pdfclasses-1); - for (int32 i = 0; i < max_num_pdfclasses - 1; i++) - for (int32 j = 0; j <= i; j++) - pdfclass_questions[i].push_back(j); - // E.g. if max_num_pdfclasses == 3, pdfclass_questions is now [ [0], [0, 1] ]. + std::vector > pdfclass_questions(max_num_pdfclasses - 1); + for (int32 i = 1; i <= max_num_pdfclasses - 1; i++) + for (int32 j = 1; j <= i; j++) + pdfclass_questions[i-1].push_back(j); + // E.g. if max_num_pdfclasses == 3, pdfclass_questions is now [ 1], [1, 2] ]. pdfclass_opts.initial_questions = pdfclass_questions; - KALDI_LOG << "Setting questions for hmm-position [hmm-position ranges from 0 to "<< (max_num_pdfclasses-1) <<"]"; + KALDI_LOG << "Setting questions for pdf-class [pdf-class ranges from 1 to "<< max_num_pdfclasses <<"]"; qo.SetQuestionsOf(kPdfClass, pdfclass_opts); WriteKaldiObject(qo, questions_out_filename, binary); diff --git a/src/bin/compile-train-graphs-fsts.cc b/src/bin/compile-train-graphs-fsts.cc index 00ec1038943..8d0203c0a5e 100644 --- a/src/bin/compile-train-graphs-fsts.cc +++ b/src/bin/compile-train-graphs-fsts.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/training-graph-compiler.h" @@ -52,9 +52,6 @@ int main(int argc, char *argv[]) { TrainingGraphCompilerOptions gopts; int32 batch_size = 250; - gopts.transition_scale = 0.0; // Change the default to 0.0 since we will generally add the - // transition probs in the alignment phase (since they change each time) - gopts.self_loop_scale = 0.0; // Ditto for self-loop probs. std::string disambig_rxfilename; gopts.Register(&po); @@ -63,7 +60,7 @@ int main(int argc, char *argv[]) { "more memory. E.g. 
500"); po.Register("read-disambig-syms", &disambig_rxfilename, "File containing " "list of disambiguation symbols in phone symbol table"); - + po.Read(argc, argv); if (po.NumArgs() != 5) { @@ -80,7 +77,7 @@ int main(int argc, char *argv[]) { ContextDependency ctx_dep; // the tree. ReadKaldiObject(tree_rxfilename, &ctx_dep); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_rxfilename, &trans_model); // need VectorFst because we will change it by adding subseq symbol. @@ -103,7 +100,7 @@ int main(int argc, char *argv[]) { SequentialTableReader fst_reader(fsts_rspecifier); TableWriter fst_writer(fsts_wspecifier); - + int num_succeed = 0, num_fail = 0; if (batch_size == 1) { // We treat batch_size of 1 as a special case in order diff --git a/src/bin/compile-train-graphs.cc b/src/bin/compile-train-graphs.cc index 874d079376e..3e3532fbd98 100644 --- a/src/bin/compile-train-graphs.cc +++ b/src/bin/compile-train-graphs.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/training-graph-compiler.h" @@ -46,9 +46,7 @@ int main(int argc, char *argv[]) { TrainingGraphCompilerOptions gopts; int32 batch_size = 250; - gopts.transition_scale = 0.0; // Change the default to 0.0 since we will generally add the - // transition probs in the alignment phase (since they change eacm time) - gopts.self_loop_scale = 0.0; // Ditto for self-loop probs. + std::string disambig_rxfilename; gopts.Register(&po); @@ -57,7 +55,7 @@ int main(int argc, char *argv[]) { "more memory. E.g. 500"); po.Register("read-disambig-syms", &disambig_rxfilename, "File containing " "list of disambiguation symbols in phone symbol table"); - + po.Read(argc, argv); if (po.NumArgs() != 5) { @@ -74,7 +72,7 @@ int main(int argc, char *argv[]) { ContextDependency ctx_dep; // the tree. ReadKaldiObject(tree_rxfilename, &ctx_dep); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_rxfilename, &trans_model); // need VectorFst because we will change it by adding subseq symbol. @@ -85,7 +83,7 @@ int main(int argc, char *argv[]) { if (!ReadIntegerVectorSimple(disambig_rxfilename, &disambig_syms)) KALDI_ERR << "fstcomposecontext: Could not read disambiguation symbols from " << disambig_rxfilename; - + TrainingGraphCompiler gc(trans_model, ctx_dep, lex_fst, disambig_syms, gopts); lex_fst = NULL; // we gave ownership to gc. 
diff --git a/src/bin/convert-ali.cc b/src/bin/convert-ali.cc index 89fe838638c..d245d93a0f8 100644 --- a/src/bin/convert-ali.cc +++ b/src/bin/convert-ali.cc @@ -22,7 +22,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "hmm/tree-accu.h" // for ReadPhoneMap @@ -38,7 +38,6 @@ int main(int argc, char *argv[]) { " convert-ali old/final.mdl new/0.mdl new/tree ark:old/ali.1 ark:new/ali.1\n"; int32 frame_subsampling_factor = 1; - bool reorder = true; bool repeat_frames = false; std::string phone_map_rxfilename; @@ -46,9 +45,6 @@ int main(int argc, char *argv[]) { po.Register("phone-map", &phone_map_rxfilename, "File name containing old->new phone mapping (each line is: " "old-integer-id new-integer-id)"); - po.Register("reorder", &reorder, - "True if you want the converted alignments to be 'reordered' " - "versus the way they appear in the HmmTopology object"); po.Register("repeat-frames", &repeat_frames, "Only relevant when frame-subsampling-factor != 1. If true, " "repeat frames of alignment by 'frame-subsampling-factor' " @@ -79,10 +75,10 @@ int main(int argc, char *argv[]) { SequentialInt32VectorReader alignment_reader(old_alignments_rspecifier); Int32VectorWriter alignment_writer(new_alignments_wspecifier); - TransitionModel old_trans_model; + Transitions old_trans_model; ReadKaldiObject(old_model_filename, &old_trans_model); - TransitionModel new_trans_model; + Transitions new_trans_model; ReadKaldiObject(new_model_filename, &new_trans_model); if (!(old_trans_model.GetTopo() == new_trans_model.GetTopo())) @@ -105,7 +101,6 @@ int main(int argc, char *argv[]) { old_alignment, frame_subsampling_factor, repeat_frames, - reorder, (phone_map_rxfilename != "" ? &phone_map : NULL), &new_alignment)) { alignment_writer.Write(key, new_alignment); diff --git a/src/bin/copy-gselect.cc b/src/bin/copy-gselect.cc index e6c92013b58..ee427d59b8e 100644 --- a/src/bin/copy-gselect.cc +++ b/src/bin/copy-gselect.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" int main(int argc, char *argv[]) { try { diff --git a/src/bin/copy-transition-model.cc b/src/bin/copy-transition-model.cc index 62a5d0c51dd..b05c64d28bf 100644 --- a/src/bin/copy-transition-model.cc +++ b/src/bin/copy-transition-model.cc @@ -17,7 +17,7 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. 
-#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fst/fstlib.h" #include "util/common-utils.h" @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) { transition_model_wxfilename = po.GetArg(2); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(transition_model_rxfilename, &trans_model); WriteKaldiObject(trans_model, transition_model_wxfilename, binary); diff --git a/src/bin/copy-tree.cc b/src/bin/copy-tree.cc index c412366b151..69ab0c309ad 100644 --- a/src/bin/copy-tree.cc +++ b/src/bin/copy-tree.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/hmm-topology.h" +#include "hmm/topology.h" #include "tree/context-dep.h" #include "tree/clusterable-classes.h" #include "util/text-utils.h" diff --git a/src/bin/cuda-gpu-available.cc b/src/bin/cuda-gpu-available.cc new file mode 100644 index 00000000000..67063fc0f96 --- /dev/null +++ b/src/bin/cuda-gpu-available.cc @@ -0,0 +1,111 @@ +// nnetbin/cuda-gpu-available.cc + +// Copyright 2015 Brno University of Technology (author: Karel Vesely) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef _MSC_VER + #include + #include +#endif + +#include "base/kaldi-common.h" +#include "util/parse-options.h" +#include "cudamatrix/cu-device.h" +#include "cudamatrix/cu-matrix.h" + +using namespace kaldi; + +#if HAVE_CUDA == 1 +/** + * With incorrect CUDA setup, this will trigger "invalid device function" error. + */ +void TestGpuComputation() { + CuMatrix m(100,100); + m.SetRandn(); + m.ApplySoftMaxPerRow(m); +} +#endif + +int main(int argc, char *argv[]) try { + + /* only for Doxygen documentation, never shown in command line */ + const char *usage = + "Test if there is a GPU available, and if the GPU setup is correct.\n" + "A GPU is acquired and a small computation is done\n" + "(generating a random matrix and computing softmax for its rows).\n" + "\n" + "exit-code: 0 = success, 1 = compiled without GPU support, -1 = error\n" + "\n" + "Usage: cuda-gpu-available\n"; + + ParseOptions po(usage); + po.Read(argc, argv); + + char hostname[100] = "UNKNOWN-HOSTNAME"; +#if !defined(_MSC_VER) && !defined(__CYGWIN__) + if (gethostname(hostname, 100)) { + KALDI_WARN << "Cannot get hostname, " << strerror(errno); + } +#endif + KALDI_LOG << "\n\n### IS CUDA GPU AVAILABLE? '" << hostname << "' ###"; +#if HAVE_CUDA == 1 + CuDevice::Instantiate().SelectGpuId("yes"); + fprintf(stderr, "### HURRAY, WE GOT A CUDA GPU FOR COMPUTATION!!! ##\n\n"); + fprintf(stderr, "### Testing CUDA setup with a small computation " + "(setup = cuda-toolkit + gpu-driver + kaldi):\n"); + // the test of setup by computation, + try { + TestGpuComputation(); + } catch (const std::exception &e) { + fprintf(stderr, "%s\n", e.what()); + KALDI_LOG << "...\n" + << "### The CUDA setup is wrong! 
" + << "(\"invalid device function\" == problem with 'compute capability' " + << "in compiled kaldi)\n" + << "### Before posting the error to forum, please try following:\n" + << "### 1) update kaldi & cuda-toolkit (& GPU driver),\n" + << "### 2) re-run 'src/configure',\n" + << "### 3) re-compile kaldi by 'make clean; make -j depend; make -j'\n" + << "###\n" + << "### If the problem persists, please send us your:\n" + << "### - GPU model name, cuda-toolkit version, driver version " + << "(run nvidia-smi), variable $(CUDA_ARCH) from src/kaldi.mk"; + return -1; + } + fprintf(stderr, "### Test OK!\n"); + return 0; +#else + std::cerr + << "### CUDA WAS NOT COMPILED IN! ###\n" + << "To support CUDA, you must run 'configure' on a machine " + << "that has the CUDA compiler 'nvcc' available.\n"; + return 1; +#endif +} catch (const std::exception &e) { + fprintf(stderr, "%s\n", e.what()); + KALDI_LOG << "...\n" + << "### WE DID NOT GET A CUDA GPU!!! ###\n" + << "### If your system has a 'free' CUDA GPU, try re-installing " + << "latest 'CUDA toolkit' from NVidia (this updates GPU drivers too).\n" + << "### Otherwise 'nvidia-smi' shows the status of GPUs:\n" + << "### - The versions should match ('NVIDIA-SMI' and 'Driver Version'), " + << "otherwise reboot or reload kernel module,\n" + << "### - The GPU should be unused " + << "(no 'process' in list, low 'memory-usage' (<100MB), low 'gpu-fan' (<30%)),\n" + << "### - You should see your GPU (burnt GPUs may disappear from the list until reboot),"; + return -1; +} diff --git a/src/bin/decode-faster-mapped.cc b/src/bin/decode-faster-mapped.cc index c7411592504..4606933411f 100644 --- a/src/bin/decode-faster-mapped.cc +++ b/src/bin/decode-faster-mapped.cc @@ -22,7 +22,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/faster-decoder.h" #include "decoder/decodable-matrix.h" @@ -67,7 +67,7 @@ int main(int argc, char *argv[]) { words_wspecifier = po.GetArg(4), alignment_wspecifier = po.GetOptArg(5); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_in_filename, &trans_model); Int32VectorWriter words_writer(words_wspecifier); diff --git a/src/bin/decode-faster.cc b/src/bin/decode-faster.cc index cbcdb771d56..a1e112b129f 100644 --- a/src/bin/decode-faster.cc +++ b/src/bin/decode-faster.cc @@ -22,7 +22,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/faster-decoder.h" #include "decoder/decodable-matrix.h" diff --git a/src/bin/draw-tree.cc b/src/bin/draw-tree.cc index d107ab1cfac..f95478e7c52 100644 --- a/src/bin/draw-tree.cc +++ b/src/bin/draw-tree.cc @@ -33,7 +33,7 @@ void MakeEvent(std::string &qry, fst::SymbolTable *phone_syms, EventValueType value; if (key == kPdfClass) { value = static_cast(atoi(valstr.c_str())); - if (value < 0) { // not valid pdf-class + if (value < 1) { // not valid pdf-class KALDI_ERR << "Bad query: invalid pdf-class (" << valstr << ')'; } } diff --git a/src/bin/est-mllt.cc b/src/bin/est-mllt.cc index 48021304b80..2a01f0dbb78 100644 --- a/src/bin/est-mllt.cc +++ b/src/bin/est-mllt.cc @@ -20,7 +20,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/mllt.h" int 
main(int argc, char *argv[]) { diff --git a/src/bin/get-post-on-ali.cc b/src/bin/get-post-on-ali.cc index 6d6dfd0d3df..471bbfbfff2 100644 --- a/src/bin/get-post-on-ali.cc +++ b/src/bin/get-post-on-ali.cc @@ -22,7 +22,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "hmm/posterior.h" diff --git a/src/bin/hmm-info.cc b/src/bin/hmm-info.cc index 4ece5e88171..6daa0bc6385 100644 --- a/src/bin/hmm-info.cc +++ b/src/bin/hmm-info.cc @@ -19,7 +19,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" int main(int argc, char *argv[]) { try { @@ -43,7 +43,7 @@ int main(int argc, char *argv[]) { std::string model_in_filename = po.GetArg(1); - TransitionModel trans_model; + Transitions trans_model; { bool binary_read; Input ki(model_in_filename, &binary_read); @@ -54,8 +54,6 @@ int main(int argc, char *argv[]) { std::cout << "number of pdfs " << trans_model.NumPdfs() << '\n'; std::cout << "number of transition-ids " << trans_model.NumTransitionIds() << '\n'; - std::cout << "number of transition-states " - << trans_model.NumTransitionStates() << '\n'; } catch(const std::exception &e) { std::cerr << e.what() << '\n'; return -1; diff --git a/src/bin/latgen-faster-mapped-parallel.cc b/src/bin/latgen-faster-mapped-parallel.cc index 4479ec8b73e..415fd1a3584 100644 --- a/src/bin/latgen-faster-mapped-parallel.cc +++ b/src/bin/latgen-faster-mapped-parallel.cc @@ -24,7 +24,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/decoder-wrappers.h" #include "decoder/decodable-matrix.h" @@ -74,7 +74,7 @@ int main(int argc, char *argv[]) { words_wspecifier = po.GetOptArg(5), alignment_wspecifier = po.GetOptArg(6); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_in_filename, &trans_model); bool determinize = config.determinize_lattice; diff --git a/src/bin/latgen-faster-mapped.cc b/src/bin/latgen-faster-mapped.cc index 610d9aa6d7d..3a65d78be04 100644 --- a/src/bin/latgen-faster-mapped.cc +++ b/src/bin/latgen-faster-mapped.cc @@ -23,7 +23,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/decoder-wrappers.h" #include "decoder/decodable-matrix.h" @@ -70,7 +70,7 @@ int main(int argc, char *argv[]) { words_wspecifier = po.GetOptArg(5), alignment_wspecifier = po.GetOptArg(6); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_in_filename, &trans_model); bool determinize = config.determinize_lattice; diff --git a/src/bin/logprob-to-post.cc b/src/bin/logprob-to-post.cc index f221580a484..0edfba0189d 100644 --- a/src/bin/logprob-to-post.cc +++ b/src/bin/logprob-to-post.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "hmm/posterior.h" diff --git a/src/bin/make-h-transducer.cc b/src/bin/make-h-transducer.cc index c54b9250cf7..e3a66a99536 100644 --- a/src/bin/make-h-transducer.cc +++ b/src/bin/make-h-transducer.cc @@ -16,7 +16,7 @@ // See the Apache 2 License for the specific 
language governing permissions and // limitations under the License. -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "tree/context-dep.h" #include "util/common-utils.h" @@ -71,17 +71,18 @@ int main(int argc, char *argv[]) { ContextDependency ctx_dep; ReadKaldiObject(tree_filename, &ctx_dep); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_filename, &trans_model); std::vector disambig_syms_out; // The work gets done here. - fst::VectorFst *H = GetHTransducer (ilabel_info, - ctx_dep, - trans_model, - hcfg, - &disambig_syms_out); + std::unique_ptr> + H = GetHTransducer (ilabel_info, + ctx_dep, + trans_model, + hcfg, + &disambig_syms_out); #if _MSC_VER if (fst_out_filename == "") _setmode(_fileno(stdout), _O_BINARY); @@ -101,7 +102,6 @@ int main(int argc, char *argv[]) { << (fst_out_filename == "" ? "standard output" : fst_out_filename); - delete H; return 0; } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/bin/make-ilabel-transducer.cc b/src/bin/make-ilabel-transducer.cc index a78cefafd3a..70a5d6d4e18 100644 --- a/src/bin/make-ilabel-transducer.cc +++ b/src/bin/make-ilabel-transducer.cc @@ -16,7 +16,7 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "tree/context-dep.h" #include "util/common-utils.h" @@ -71,7 +71,7 @@ int main(int argc, char *argv[]) { ContextDependency ctx_dep; ReadKaldiObject(tree_filename, &ctx_dep); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_filename, &trans_model); diff --git a/src/bin/make-pdf-to-tid-transducer.cc b/src/bin/make-pdf-to-tid-transducer.cc index 907380a974d..b4ed45192e6 100644 --- a/src/bin/make-pdf-to-tid-transducer.cc +++ b/src/bin/make-pdf-to-tid-transducer.cc @@ -16,7 +16,7 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "util/common-utils.h" #include "fst/fstlib.h" @@ -47,10 +47,11 @@ int main(int argc, char *argv[]) { std::string trans_model_filename = po.GetArg(1); std::string fst_out_filename = po.GetOptArg(2); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(trans_model_filename, &trans_model); - fst::VectorFst *fst = GetPdfToTransitionIdTransducer(trans_model); + std::unique_ptr> fst = + GetPdfToTransitionIdTransducer(trans_model); #if _MSC_VER if (fst_out_filename == "") @@ -60,7 +61,6 @@ int main(int argc, char *argv[]) { if (!fst->Write(fst_out_filename)) KALDI_ERR << "Error writing fst to " << (fst_out_filename == "" ? 
"standard output" : fst_out_filename); - delete fst; } catch(const std::exception &e) { std::cerr << e.what(); return -1; diff --git a/src/bin/phones-to-prons.cc b/src/bin/phones-to-prons.cc index 0d7ab12c232..23c17a58385 100644 --- a/src/bin/phones-to-prons.cc +++ b/src/bin/phones-to-prons.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "util/common-utils.h" #include "fst/fstlib.h" diff --git a/src/bin/post-to-pdf-post.cc b/src/bin/post-to-pdf-post.cc index 99aa5770aa5..6c2227806b4 100644 --- a/src/bin/post-to-pdf-post.cc +++ b/src/bin/post-to-pdf-post.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "hmm/posterior.h" @@ -50,7 +50,7 @@ int main(int argc, char *argv[]) { posteriors_rspecifier = po.GetArg(2), posteriors_wspecifier = po.GetArg(3); - TransitionModel trans_model; + Transitions trans_model; { bool binary_in; Input ki(model_rxfilename, &binary_in); diff --git a/src/bin/post-to-phone-post.cc b/src/bin/post-to-phone-post.cc index 871f03a91a1..cf97c631243 100644 --- a/src/bin/post-to-phone-post.cc +++ b/src/bin/post-to-phone-post.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/posterior.h" int main(int argc, char *argv[]) { @@ -64,7 +64,7 @@ int main(int argc, char *argv[]) { kaldi::SequentialPosteriorReader posterior_reader(post_rspecifier); kaldi::PosteriorWriter posterior_writer(phone_post_wspecifier); - TransitionModel trans_model; + Transitions trans_model; { bool binary_in; Input ki(model_rxfilename, &binary_in); @@ -98,11 +98,11 @@ int main(int argc, char *argv[]) { for (int32 i = 1; i <= num_tids; i++) { BaseFloat count = transition_counts(i); - int32 phone = trans_model.TransitionIdToPhone(i), - pdf_id = trans_model.TransitionIdToPdf(i); + const Transitions::TransitionIdInfo + &info = trans_model.InfoForTransitionId(i); // Relying on C++11 value-initialization thingies that should make the // map's elements default to zero. 
- pdf_to_phones[pdf_id][phone] += count; + pdf_to_phones[info.pdf_id][info.phone] += count; } for (int32 i = 0; i < num_pdfs; i++) { diff --git a/src/bin/post-to-tacc.cc b/src/bin/post-to-tacc.cc index afa5315d6b4..842356f8ffb 100644 --- a/src/bin/post-to-tacc.cc +++ b/src/bin/post-to-tacc.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/posterior.h" int main(int argc, char *argv[]) { @@ -61,7 +61,7 @@ int main(int argc, char *argv[]) { bool binary_in; Input ki(model_rxfilename, &binary_in); - TransitionModel trans_model; + Transitions trans_model; trans_model.Read(ki.Stream(), binary_in); num_transition_ids = trans_model.NumTransitionIds(); @@ -90,7 +90,7 @@ int main(int argc, char *argv[]) { int32 num_pdf_ids = trans_model.NumPdfs(); Vector pdf_accs(num_pdf_ids); for (int32 i = 1; i < num_transition_ids; i++) { - int32 pid = trans_model.TransitionIdToPdf(i); + int32 pid = trans_model.TransitionIdToPdfFast(i); pdf_accs(pid) += transition_accs(i); } Vector pdf_accs_float(pdf_accs); diff --git a/src/bin/prob-to-post.cc b/src/bin/prob-to-post.cc index 4266d34ca47..7bdff6f1e78 100644 --- a/src/bin/prob-to-post.cc +++ b/src/bin/prob-to-post.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "hmm/posterior.h" diff --git a/src/bin/prons-to-wordali.cc b/src/bin/prons-to-wordali.cc index a6331043500..8579c79ea02 100644 --- a/src/bin/prons-to-wordali.cc +++ b/src/bin/prons-to-wordali.cc @@ -19,7 +19,7 @@ #include "base/kaldi-common.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "util/common-utils.h" #include "fst/fstlib.h" diff --git a/src/bin/show-alignments.cc b/src/bin/show-alignments.cc index 06bc907005f..f8c79d2d79b 100644 --- a/src/bin/show-alignments.cc +++ b/src/bin/show-alignments.cc @@ -19,7 +19,7 @@ #include "base/kaldi-common.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "util/common-utils.h" #include "fst/fstlib.h" @@ -47,7 +47,7 @@ int main(int argc, char *argv[]) { model_filename = po.GetArg(2), alignments_rspecifier = po.GetArg(3); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_filename, &trans_model); fst::SymbolTable *phones_symtab = NULL; @@ -80,8 +80,7 @@ int main(int argc, char *argv[]) { split_str[i] = ss.str(); int32 tid = split[i][0], - tstate = trans_model.TransitionIdToTransitionState(tid), - phone = trans_model.TransitionStateToPhone(tstate); + phone = trans_model.InfoForTransitionId(tid).phone; split_str_phones[i] = phones_symtab->Find(phone) + " "; std::string space; diff --git a/src/bin/show-transitions.cc b/src/bin/show-transitions.cc index bdc780b060a..db72d47f988 100644 --- a/src/bin/show-transitions.cc +++ b/src/bin/show-transitions.cc @@ -18,7 +18,7 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. 
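The post-to-phone-post.cc, post-to-tacc.cc and show-alignments.cc hunks above show the new per-transition-id lookups: per-id data such as the phone and pdf-id now comes from InfoForTransitionId(), the pdf-id-only lookup is TransitionIdToPdfFast(), and the old TransitionIdToPhone() / TransitionIdToTransitionState() + TransitionStateToPhone() combinations are gone. A minimal sketch of the pattern; the standalone main() and the "final.mdl" path are placeholders:

    #include "hmm/transitions.h"
    #include "util/common-utils.h"

    int main() {
      using namespace kaldi;
      Transitions trans_model;
      ReadKaldiObject("final.mdl", &trans_model);
      for (int32 tid = 1; tid <= trans_model.NumTransitionIds(); tid++) {
        const Transitions::TransitionIdInfo &info =
            trans_model.InfoForTransitionId(tid);
        int32 phone = info.phone;                               // was TransitionIdToPhone(tid)
        int32 pdf_id = trans_model.TransitionIdToPdfFast(tid);  // was TransitionIdToPdf(tid)
        KALDI_ASSERT(phone > 0 && pdf_id == info.pdf_id);
      }
      return 0;
    }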
-#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fst/fstlib.h" #include "util/common-utils.h" @@ -59,7 +59,7 @@ int main(int argc, char *argv[]) { for (size_t i = 0; i < syms->NumSymbols(); i++) names[i] = syms->Find(i); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(transition_model_filename, &trans_model); Vector occs; diff --git a/src/bin/tree-info.cc b/src/bin/tree-info.cc index ce3c5c9cfc1..a1f4f21e983 100644 --- a/src/bin/tree-info.cc +++ b/src/bin/tree-info.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/hmm-topology.h" +#include "hmm/topology.h" #include "tree/context-dep.h" int main(int argc, char *argv[]) { diff --git a/src/bin/weight-silence-post.cc b/src/bin/weight-silence-post.cc index dba935d1cd3..3c8478752c8 100644 --- a/src/bin/weight-silence-post.cc +++ b/src/bin/weight-silence-post.cc @@ -22,7 +22,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "hmm/posterior.h" @@ -71,7 +71,7 @@ int main(int argc, char *argv[]) { KALDI_WARN <<"No silence phones, this will have no effect"; ConstIntegerSet silence_set(silence_phones); // faster lookup. - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(model_rxfilename, &trans_model); int32 num_posteriors = 0; diff --git a/src/cblasext/Makefile b/src/cblasext/Makefile new file mode 100644 index 00000000000..a3d684cdee7 --- /dev/null +++ b/src/cblasext/Makefile @@ -0,0 +1,21 @@ + + +all: + +OPENFST_CXXFLAGS = +OPENFST_LDLIBS = + +include ../kaldi.mk + + +# you can uncomment matrix-lib-speed-test if you want to do the speed tests. + +TESTFILES = + +OBJFILES = cblas-extensions.o + +LIBNAME = kaldi-cblasext + +ADDLIBS = ../base/kaldi-base.a + +include ../makefiles/default_rules.mk diff --git a/src/cblasext/cblas-extensions.cc b/src/cblasext/cblas-extensions.cc new file mode 100644 index 00000000000..8d23ae6ab2d --- /dev/null +++ b/src/cblasext/cblas-extensions.cc @@ -0,0 +1,162 @@ +// cblasext/cblas-extensions.cc + +// Copyright 2019 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cblasext/cblas-wrappers.h" +#include "cblasext/cblas-extensions.h" + +namespace kaldi { + +template +void cblasext_Xgemv_sparsevec(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows, + KaldiBlasInt num_cols, Real alpha, const Real *Mdata, + KaldiBlasInt stride, const Real *xdata, + KaldiBlasInt incX, Real beta, Real *ydata, + KaldiBlasInt incY) { + if (trans == CblasNoTrans) { + if (beta != 1.0) cblas_Xscal(num_rows, beta, ydata, incY); + for (KaldiBlasInt i = 0; i < num_cols; i++) { + Real x_i = xdata[i * incX]; + if (x_i == 0.0) continue; + // Add to ydata, the i'th column of M, times alpha * x_i + cblas_Xaxpy(num_rows, x_i * alpha, Mdata + i, stride, ydata, incY); + } + } else { + if (beta != 1.0) cblas_Xscal(num_cols, beta, ydata, incY); + for (KaldiBlasInt i = 0; i < num_rows; i++) { + Real x_i = xdata[i * incX]; + if (x_i == 0.0) continue; + // Add to ydata, the i'th row of M, times alpha * x_i + cblas_Xaxpy(num_cols, x_i * alpha, + Mdata + (i * stride), 1, ydata, incY); + } + } +} + + +template +void cblasext_Xgemv_sparsevec(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows, + KaldiBlasInt num_cols, float alpha, const float *Mdata, + KaldiBlasInt stride, const float *xdata, + KaldiBlasInt incX, float beta, float *ydata, + KaldiBlasInt incY); +template +void cblasext_Xgemv_sparsevec(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows, + KaldiBlasInt num_cols, double alpha, const double *Mdata, + KaldiBlasInt stride, const double *xdata, + KaldiBlasInt incX, double beta, double *ydata, + KaldiBlasInt incY); + + +template +void cblasext_mul_elements_vec( + const KaldiBlasInt dim, + const Real *a, + Real *b) { // does b *= a, elementwise. + Real c1, c2, c3, c4; + KaldiBlasInt i; + for (i = 0; i + 4 <= dim; i += 4) { + c1 = a[i] * b[i]; + c2 = a[i+1] * b[i+1]; + c3 = a[i+2] * b[i+2]; + c4 = a[i+3] * b[i+3]; + b[i] = c1; + b[i+1] = c2; + b[i+2] = c3; + b[i+3] = c4; + } + for (; i < dim; i++) + b[i] *= a[i]; +} + +template void cblasext_mul_elements_vec(const KaldiBlasInt dim, + const float *a, float *b); +template void cblasext_mul_elements_vec(const KaldiBlasInt dim, + const double *a, double *b); + + +template +void cblasext_mul_elements_mat( + const Real *Adata, + KaldiBlasInt a_num_rows, + KaldiBlasInt a_num_cols, + KaldiBlasInt a_stride, + Real *Bdata, + KaldiBlasInt b_stride) { + if (a_num_cols == a_stride && a_num_cols == b_stride) { + cblasext_mul_elements_vec(a_num_rows * a_num_cols, Adata, Bdata); + } else { + for (KaldiBlasInt i = 0; i < a_num_rows; i++) { + cblasext_mul_elements_vec(a_num_cols, Adata, Bdata); + Adata += a_stride; + Bdata += b_stride; + } + } +} + + +template void cblasext_mul_elements_mat( + const float *Adata, KaldiBlasInt a_num_rows, + KaldiBlasInt a_num_cols, KaldiBlasInt a_stride, + float *Bdata, KaldiBlasInt b_stride); +template void cblasext_mul_elements_mat( + const double *Adata, KaldiBlasInt a_num_rows, + KaldiBlasInt a_num_cols, KaldiBlasInt a_stride, + double *Bdata, KaldiBlasInt b_stride); + + +template +Real cblasext_trace_mat_mat( + const Real *a_data, + KaldiBlasInt a_num_rows, KaldiBlasInt a_num_cols, + KaldiBlasInt a_stride, KaldiBlasInt a_col_stride, + const Real *b_data, CBLAS_TRANSPOSE b_trans, + KaldiBlasInt b_stride, KaldiBlasInt b_col_stride) { + Real ans = 0.0; + if (b_trans == CblasNoTrans) { + for (KaldiBlasInt i = 0; i < a_num_rows; + i++, a_data += a_stride, b_data += b_col_stride) { + ans += cblas_Xdot(a_num_cols, a_data, a_col_stride, b_data, b_stride); + } + return ans; + } else { + for (KaldiBlasInt i = 0; i < a_num_rows; + i++, 
a_data += a_stride, b_data += b_stride) { + ans += cblas_Xdot(a_num_cols, a_data, a_col_stride, + b_data, b_col_stride); + } + return ans; + } +} + +template float cblasext_trace_mat_mat( + const float *a_data, + KaldiBlasInt a_num_rows, KaldiBlasInt a_num_cols, + KaldiBlasInt a_stride, KaldiBlasInt a_col_stride, + const float *b_data, CBLAS_TRANSPOSE b_trans, + KaldiBlasInt b_stride, KaldiBlasInt b_col_stride); +template double cblasext_trace_mat_mat( + const double *a_data, + KaldiBlasInt a_num_rows, KaldiBlasInt a_num_cols, + KaldiBlasInt a_stride, KaldiBlasInt a_col_stride, + const double *b_data, CBLAS_TRANSPOSE b_trans, + KaldiBlasInt b_stride, KaldiBlasInt b_col_stride); + + + +} // namespace kaldi diff --git a/src/cblasext/cblas-extensions.h b/src/cblasext/cblas-extensions.h new file mode 100644 index 00000000000..f3c3dbe3be9 --- /dev/null +++ b/src/cblasext/cblas-extensions.h @@ -0,0 +1,110 @@ +// cblasext/cblas-extensions.h + +// Copyright 2012-2019 Johns Hopkins University (author: Daniel Povey); +// Haihua Xu; Wei Shi + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_MATRIX_CBLAS_EXTENSIONS_H_ +#define KALDI_MATRIX_CBLAS_EXTENSIONS_H_ 1 + + +#include "cblasext/kaldi-blas.h" +#include "cblasext/cblas-wrappers.h" + +// In directories other than this directory, this file is intended to mostly be +// included from .cc files, not from headers, since it includes cblas headers +// (via kaldi-blas.h) and those can be quite polluting. + +// This file contains templated wrappers for CBLAS functions, which enable C++ +// code calling these functions to be templated. +namespace kaldi { + + + +// This has the same interface as cblas_Xgemv, i.e. it does y = alpha M x + beta y; +// it is just specialized for the case where the vector 'x' has a lot of zeros. +template +void cblasext_Xgemv_sparsevec(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows, + KaldiBlasInt num_cols, Real alpha, const Real *Mdata, + KaldiBlasInt stride, const Real *xdata, + KaldiBlasInt incX, Real beta, Real *ydata, + KaldiBlasInt incY); + + + +/** + Does, elementwise for 0 <= i < dim, + b[i] *= a[i]. +*/ +template +void cblasext_mul_elements_vec( + const KaldiBlasInt dim, + const Real *a, + Real *b); + + +/** + Does b *= where a and b are matrices of the same dimension. + Does not currently support transpose. + + Requires that a and b do not overlap (but this is not checked). +*/ +template +void cblasext_mul_elements_mat( + const Real *Adata, + KaldiBlasInt a_num_rows, KaldiBlasInt a_num_cols, KaldiBlasInt a_stride, + Real *Bdata, + KaldiBlasInt b_stride); + +/** + For matrices A and B (possibly with column strides as well as + row strides): if transB = false, compute + tr(A B) = \sum_{i,j} A(i, j) B(j, i) + or if transB = true, compute + tr(A B) = \sum_{i,j} A(i, j) B(i, j). 
+ @param [in] Adata Data pointer of matrix A + @param [in] a_num_rows Number of rows of matrix A + @param [in] a_num_cols Number of columns of matrix A + @param [in] a_stride Row stride of matrix A; may have any value. + @param [in] a_col_stride Column stride of A, would be 1 for + a normal matrix; must be positive. + @param [in] b_data Data pointer of matrix B; may be + the same as Adata. + @param [in] b_trans True if B is transposed. Note: the + expression would have the same value + if the transpose was applied to A + instead. + @param [in] b_stride Row stride of matrix B; may have any + value. + @param [in] b_col_stride Column stride of matrix B; must be + positive, will normally be 1. + */ +template +Real cblasext_trace_mat_mat( + const Real *a_data, + KaldiBlasInt a_num_rows, KaldiBlasInt a_num_cols, + KaldiBlasInt a_stride, KaldiBlasInt a_col_stride, + const Real *b_data, CBLAS_TRANSPOSE b_trans, + KaldiBlasInt b_stride, KaldiBlasInt b_col_stride); + + + + + +} +// namespace kaldi + +#endif diff --git a/src/cblasext/cblas-wrappers.h b/src/cblasext/cblas-wrappers.h new file mode 100644 index 00000000000..39fa12931ca --- /dev/null +++ b/src/cblasext/cblas-wrappers.h @@ -0,0 +1,408 @@ +// matrix/cblas-wrappers.h + +// Copyright 2012-2019 Johns Hopkins University (author: Daniel Povey); +// Haihua Xu; Wei Shi + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_MATRIX_CBLAS_WRAPPERS_H_ +#define KALDI_MATRIX_CBLAS_WRAPPERS_H_ 1 + + +#include "cblasext/kaldi-blas.h" + +// In directories other than this directory, this file is intended to mostly be +// included from .cc files, not from headers, since it includes cblas headers +// (via kaldi-blas.h) and those can be quite polluting. + +// This file contains templated wrappers for CBLAS functions, which enable C++ +// code calling these functions to be templated. 
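As the file-level comment above says, these wrappers overload on float and double so that higher-level code, like the cblasext_ helpers just shown, can be templated on the real type; the wrapper bodies themselves follow below. A minimal sketch of both uses, assuming contiguous row-major 3x4 matrices for the trace call (the helper names here are illustrative, not part of the patch):

#include "cblasext/cblas-wrappers.h"
#include "cblasext/cblas-extensions.h"

namespace kaldi {
// Compiles for Real = float or double because cblas_Xdot resolves by overload.
template <typename Real>
Real SquaredNorm(const Real *x, KaldiBlasInt dim) {
  return cblas_Xdot(dim, x, 1, x, 1);
}

// With b_trans = CblasTrans this returns \sum_{i,j} A(i,j) * B(i,j); for a
// contiguous row-major matrix the row stride equals the number of columns and
// the column stride is 1.
double TraceExample(const double *A, const double *B) {
  return cblasext_trace_mat_mat(A, /*a_num_rows=*/3, /*a_num_cols=*/4,
                                /*a_stride=*/4, /*a_col_stride=*/1,
                                B, CblasTrans,
                                /*b_stride=*/4, /*b_col_stride=*/1);
}
}  // namespace kaldi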
+namespace kaldi { + + +inline void cblas_Xcopy(const KaldiBlasInt N, const float *X, const KaldiBlasInt incX, float *Y, + const KaldiBlasInt incY) { + cblas_scopy(N, X, incX, Y, incY); +} + +inline void cblas_Xcopy(const KaldiBlasInt N, const double *X, const KaldiBlasInt incX, double *Y, + const KaldiBlasInt incY) { + cblas_dcopy(N, X, incX, Y, incY); +} + +inline float cblas_Xasum(const KaldiBlasInt N, const float *X, const KaldiBlasInt incX) { + return cblas_sasum(N, X, incX); +} + +inline double cblas_Xasum(const KaldiBlasInt N, const double *X, const KaldiBlasInt incX) { + return cblas_dasum(N, X, incX); +} + +inline void cblas_Xrot(const KaldiBlasInt N, float *X, const KaldiBlasInt incX, float *Y, + const KaldiBlasInt incY, const float c, const float s) { + cblas_srot(N, X, incX, Y, incY, c, s); +} +inline void cblas_Xrot(const KaldiBlasInt N, double *X, const KaldiBlasInt incX, double *Y, + const KaldiBlasInt incY, const double c, const double s) { + cblas_drot(N, X, incX, Y, incY, c, s); +} +inline float cblas_Xdot(const KaldiBlasInt N, const float *const X, + const KaldiBlasInt incX, const float *const Y, + const KaldiBlasInt incY) { + return cblas_sdot(N, X, incX, Y, incY); +} +inline double cblas_Xdot(const KaldiBlasInt N, const double *const X, + const KaldiBlasInt incX, const double *const Y, + const KaldiBlasInt incY) { + return cblas_ddot(N, X, incX, Y, incY); +} +inline void cblas_Xaxpy(const KaldiBlasInt N, const float alpha, const float *X, + const KaldiBlasInt incX, float *Y, const KaldiBlasInt incY) { + cblas_saxpy(N, alpha, X, incX, Y, incY); +} +inline void cblas_Xaxpy(const KaldiBlasInt N, const double alpha, const double *X, + const KaldiBlasInt incX, double *Y, const KaldiBlasInt incY) { + cblas_daxpy(N, alpha, X, incX, Y, incY); +} +inline void cblas_Xscal(const KaldiBlasInt N, const float alpha, float *data, + const KaldiBlasInt inc) { + cblas_sscal(N, alpha, data, inc); +} +inline void cblas_Xscal(const KaldiBlasInt N, const double alpha, double *data, + const KaldiBlasInt inc) { + cblas_dscal(N, alpha, data, inc); +} +inline void cblas_Xtpmv(CBLAS_TRANSPOSE trans, const float *Mdata, + const KaldiBlasInt num_rows, float *y, const KaldiBlasInt y_inc) { + cblas_stpmv(CblasRowMajor, CblasLower, static_cast(trans), + CblasNonUnit, num_rows, Mdata, y, y_inc); +} +inline void cblas_Xtpmv(CBLAS_TRANSPOSE trans, const double *Mdata, + const KaldiBlasInt num_rows, double *y, const KaldiBlasInt y_inc) { + cblas_dtpmv(CblasRowMajor, CblasLower, static_cast(trans), + CblasNonUnit, num_rows, Mdata, y, y_inc); +} + + +inline void cblas_Xtpsv(CBLAS_TRANSPOSE trans, const float *Mdata, + const KaldiBlasInt num_rows, float *y, const KaldiBlasInt y_inc) { + cblas_stpsv(CblasRowMajor, CblasLower, static_cast(trans), + CblasNonUnit, num_rows, Mdata, y, y_inc); +} +inline void cblas_Xtpsv(CBLAS_TRANSPOSE trans, const double *Mdata, + const KaldiBlasInt num_rows, double *y, const KaldiBlasInt y_inc) { + cblas_dtpsv(CblasRowMajor, CblasLower, static_cast(trans), + CblasNonUnit, num_rows, Mdata, y, y_inc); +} + +// x = alpha * M * y + beta * x +inline void cblas_Xspmv(KaldiBlasInt dim, float alpha, const float *Mdata, + const float *ydata, KaldiBlasInt ystride, + float beta, float *xdata, KaldiBlasInt xstride) { + cblas_sspmv(CblasRowMajor, CblasLower, dim, alpha, Mdata, + ydata, ystride, beta, xdata, xstride); +} +inline void cblas_Xspmv(KaldiBlasInt dim, double alpha, const double *Mdata, + const double *ydata, KaldiBlasInt ystride, + double beta, double *xdata, KaldiBlasInt 
xstride) { + cblas_dspmv(CblasRowMajor, CblasLower, dim, alpha, Mdata, + ydata, ystride, beta, xdata, xstride); +} + +// Implements A += alpha * (x y' + y x'); A is symmetric matrix. +inline void cblas_Xspr2(KaldiBlasInt dim, float alpha, const float *Xdata, + KaldiBlasInt incX, const float *Ydata, KaldiBlasInt incY, + float *Adata) { + cblas_sspr2(CblasRowMajor, CblasLower, dim, alpha, Xdata, + incX, Ydata, incY, Adata); +} +inline void cblas_Xspr2(KaldiBlasInt dim, double alpha, const double *Xdata, + KaldiBlasInt incX, const double *Ydata, KaldiBlasInt incY, + double *Adata) { + cblas_dspr2(CblasRowMajor, CblasLower, dim, alpha, Xdata, + incX, Ydata, incY, Adata); +} + +// Implements A += alpha * (x x'); A is symmetric matrix. +inline void cblas_Xspr(KaldiBlasInt dim, float alpha, const float *Xdata, + KaldiBlasInt incX, float *Adata) { + cblas_sspr(CblasRowMajor, CblasLower, dim, alpha, Xdata, incX, Adata); +} +inline void cblas_Xspr(KaldiBlasInt dim, double alpha, const double *Xdata, + KaldiBlasInt incX, double *Adata) { + cblas_dspr(CblasRowMajor, CblasLower, dim, alpha, Xdata, incX, Adata); +} + +// sgemv,dgemv: y = alpha M x + beta y. +inline void cblas_Xgemv(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows, + KaldiBlasInt num_cols, float alpha, const float *Mdata, + KaldiBlasInt stride, const float *xdata, + KaldiBlasInt incX, float beta, float *ydata, KaldiBlasInt incY) { + cblas_sgemv(CblasRowMajor, static_cast(trans), num_rows, + num_cols, alpha, Mdata, stride, xdata, incX, beta, ydata, incY); +} +inline void cblas_Xgemv(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows, + KaldiBlasInt num_cols, double alpha, const double *Mdata, + KaldiBlasInt stride, const double *xdata, + KaldiBlasInt incX, double beta, double *ydata, KaldiBlasInt incY) { + cblas_dgemv(CblasRowMajor, static_cast(trans), num_rows, + num_cols, alpha, Mdata, stride, xdata, incX, beta, ydata, incY); +} + +// sgbmv, dgmmv: y = alpha M x + + beta * y. +inline void cblas_Xgbmv(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows, + KaldiBlasInt num_cols, KaldiBlasInt num_below, + KaldiBlasInt num_above, float alpha, const float *Mdata, + KaldiBlasInt stride, const float *xdata, + KaldiBlasInt incX, float beta, float *ydata, KaldiBlasInt incY) { + cblas_sgbmv(CblasRowMajor, static_cast(trans), num_rows, + num_cols, num_below, num_above, alpha, Mdata, stride, xdata, + incX, beta, ydata, incY); +} +inline void cblas_Xgbmv(CBLAS_TRANSPOSE trans, KaldiBlasInt num_rows, + KaldiBlasInt num_cols, KaldiBlasInt num_below, + KaldiBlasInt num_above, double alpha, const double *Mdata, + KaldiBlasInt stride, const double *xdata, + KaldiBlasInt incX, double beta, double *ydata, KaldiBlasInt incY) { + cblas_dgbmv(CblasRowMajor, static_cast(trans), num_rows, + num_cols, num_below, num_above, alpha, Mdata, stride, xdata, + incX, beta, ydata, incY); +} + +inline void cblas_Xgemm(const float alpha, + CBLAS_TRANSPOSE transA, + const float *Adata, + KaldiBlasInt a_num_rows, KaldiBlasInt a_num_cols, KaldiBlasInt a_stride, + CBLAS_TRANSPOSE transB, + const float *Bdata, KaldiBlasInt b_stride, + const float beta, + float *Mdata, + KaldiBlasInt num_rows, KaldiBlasInt num_cols,KaldiBlasInt stride) { + cblas_sgemm(CblasRowMajor, static_cast(transA), + static_cast(transB), + num_rows, num_cols, transA == CblasNoTrans ? 
a_num_cols : a_num_rows, + alpha, Adata, a_stride, Bdata, b_stride, + beta, Mdata, stride); +} +inline void cblas_Xgemm(const double alpha, + CBLAS_TRANSPOSE transA, + const double *Adata, + KaldiBlasInt a_num_rows, KaldiBlasInt a_num_cols, KaldiBlasInt a_stride, + CBLAS_TRANSPOSE transB, + const double *Bdata, KaldiBlasInt b_stride, + const double beta, + double *Mdata, + KaldiBlasInt num_rows, KaldiBlasInt num_cols,KaldiBlasInt stride) { + cblas_dgemm(CblasRowMajor, static_cast(transA), + static_cast(transB), + num_rows, num_cols, transA == CblasNoTrans ? a_num_cols : a_num_rows, + alpha, Adata, a_stride, Bdata, b_stride, + beta, Mdata, stride); +} + + +inline void cblas_Xsymm(const float alpha, + KaldiBlasInt sz, + const float *Adata,KaldiBlasInt a_stride, + const float *Bdata,KaldiBlasInt b_stride, + const float beta, + float *Mdata, KaldiBlasInt stride) { + cblas_ssymm(CblasRowMajor, CblasLeft, CblasLower, sz, sz, alpha, Adata, + a_stride, Bdata, b_stride, beta, Mdata, stride); +} +inline void cblas_Xsymm(const double alpha, + KaldiBlasInt sz, + const double *Adata,KaldiBlasInt a_stride, + const double *Bdata,KaldiBlasInt b_stride, + const double beta, + double *Mdata, KaldiBlasInt stride) { + cblas_dsymm(CblasRowMajor, CblasLeft, CblasLower, sz, sz, alpha, Adata, + a_stride, Bdata, b_stride, beta, Mdata, stride); +} +// ger: M += alpha x y^T. +inline void cblas_Xger(KaldiBlasInt num_rows, KaldiBlasInt num_cols, float alpha, + const float *xdata, KaldiBlasInt incX, const float *ydata, + KaldiBlasInt incY, float *Mdata, KaldiBlasInt stride) { + cblas_sger(CblasRowMajor, num_rows, num_cols, alpha, xdata, 1, ydata, 1, + Mdata, stride); +} +inline void cblas_Xger(KaldiBlasInt num_rows, KaldiBlasInt num_cols, double alpha, + const double *xdata, KaldiBlasInt incX, const double *ydata, + KaldiBlasInt incY, double *Mdata, KaldiBlasInt stride) { + cblas_dger(CblasRowMajor, num_rows, num_cols, alpha, xdata, 1, ydata, 1, + Mdata, stride); +} + +// syrk: symmetric rank-k update. +// if trans==CblasNoTrans, then C = alpha A A^T + beta C +// else C = alpha A^T A + beta C. +// note: dim_c is dim(C), other_dim_a is the "other" dimension of A, i.e. +// num-cols(A) if CblasNoTrans, or num-rows(A) if CblasTrans. +// We only need the row-major and lower-triangular option of this, and this +// is hard-coded. +inline void cblas_Xsyrk ( + const CBLAS_TRANSPOSE trans, const KaldiBlasInt dim_c, + const KaldiBlasInt other_dim_a, const float alpha, const float *A, + const KaldiBlasInt a_stride, const float beta, float *C, + const KaldiBlasInt c_stride) { + cblas_ssyrk(CblasRowMajor, CblasLower, static_cast(trans), + dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride); +} + +inline void cblas_Xsyrk( + const CBLAS_TRANSPOSE trans, const KaldiBlasInt dim_c, + const KaldiBlasInt other_dim_a, const double alpha, const double *A, + const KaldiBlasInt a_stride, const double beta, double *C, + const KaldiBlasInt c_stride) { + cblas_dsyrk(CblasRowMajor, CblasLower, static_cast(trans), + dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride); +} + +/// matrix-vector multiply using a banded matrix; we always call this +/// with b = 1 meaning we're multiplying by a diagonal matrix. This is used for +/// elementwise multiplication. We miss some of the arguments out of this +/// wrapper. 
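The comment above is the reason these last two wrappers exist: with zero bandwidth, sbmv reduces to multiplication by a diagonal matrix, which gives an elementwise multiply-accumulate. A hedged sketch of that reading (the wrapper definitions themselves follow immediately below; the helper here is purely illustrative):

#include "cblasext/cblas-wrappers.h"

namespace kaldi {
// Computes y[i] = beta * y[i] + alpha * a[i] * x[i] for i in [0, dim), by
// treating 'a' as the diagonal of a banded matrix with zero bandwidth.
inline void MulElementsAddDouble(KaldiBlasInt dim, double alpha,
                                 const double *a, const double *x,
                                 double beta, double *y) {
  cblas_Xsbmv1(dim, a, alpha, x, beta, y);
}
}  // namespace kaldi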
+inline void cblas_Xsbmv1( + const KaldiBlasInt dim, + const double *A, + const double alpha, + const double *x, + const double beta, + double *y) { + cblas_dsbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A, + 1, x, 1, beta, y, 1); +} + +inline void cblas_Xsbmv1( + const KaldiBlasInt dim, + const float *A, + const float alpha, + const float *x, + const float beta, + float *y) { + cblas_ssbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A, + 1, x, 1, beta, y, 1); +} + + +// add clapack here +#if !defined(HAVE_ATLAS) +inline void clapack_Xtptri(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *result) { + stptri_(const_cast("U"), const_cast("N"), num_rows, Mdata, result); +} +inline void clapack_Xtptri(KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *result) { + dtptri_(const_cast("U"), const_cast("N"), num_rows, Mdata, result); +} +// +inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols, + float *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot, + KaldiBlasInt *result) { + sgetrf_(num_rows, num_cols, Mdata, stride, pivot, result); +} +inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols, + double *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot, + KaldiBlasInt *result) { + dgetrf_(num_rows, num_cols, Mdata, stride, pivot, result); +} + +// +inline void clapack_Xgetri2(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *stride, + KaldiBlasInt *pivot, float *p_work, + KaldiBlasInt *l_work, KaldiBlasInt *result) { + sgetri_(num_rows, Mdata, stride, pivot, p_work, l_work, result); +} +inline void clapack_Xgetri2(KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *stride, + KaldiBlasInt *pivot, double *p_work, + KaldiBlasInt *l_work, KaldiBlasInt *result) { + dgetri_(num_rows, Mdata, stride, pivot, p_work, l_work, result); +} +// +inline void clapack_Xgesvd(char *v, char *u, KaldiBlasInt *num_cols, + KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *stride, + float *sv, float *Vdata, KaldiBlasInt *vstride, + float *Udata, KaldiBlasInt *ustride, float *p_work, + KaldiBlasInt *l_work, KaldiBlasInt *result) { + sgesvd_(v, u, + num_cols, num_rows, Mdata, stride, + sv, Vdata, vstride, Udata, ustride, + p_work, l_work, result); +} +inline void clapack_Xgesvd(char *v, char *u, KaldiBlasInt *num_cols, + KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *stride, + double *sv, double *Vdata, KaldiBlasInt *vstride, + double *Udata, KaldiBlasInt *ustride, double *p_work, + KaldiBlasInt *l_work, KaldiBlasInt *result) { + dgesvd_(v, u, + num_cols, num_rows, Mdata, stride, + sv, Vdata, vstride, Udata, ustride, + p_work, l_work, result); +} +// +void inline clapack_Xsptri(KaldiBlasInt *num_rows, float *Mdata, + KaldiBlasInt *ipiv, float *work, KaldiBlasInt *result) { + ssptri_(const_cast("U"), num_rows, Mdata, ipiv, work, result); +} +void inline clapack_Xsptri(KaldiBlasInt *num_rows, double *Mdata, + KaldiBlasInt *ipiv, double *work, KaldiBlasInt *result) { + dsptri_(const_cast("U"), num_rows, Mdata, ipiv, work, result); +} +// +void inline clapack_Xsptrf(KaldiBlasInt *num_rows, float *Mdata, + KaldiBlasInt *ipiv, KaldiBlasInt *result) { + ssptrf_(const_cast("U"), num_rows, Mdata, ipiv, result); +} +void inline clapack_Xsptrf(KaldiBlasInt *num_rows, double *Mdata, + KaldiBlasInt *ipiv, KaldiBlasInt *result) { + dsptrf_(const_cast("U"), num_rows, Mdata, ipiv, result); +} +#else +inline void clapack_Xgetrf(KaldiBlasInt num_rows, KaldiBlasInt num_cols, + float *Mdata, KaldiBlasInt stride, + KaldiBlasInt *pivot, KaldiBlasInt *result) { + *result = 
clapack_sgetrf(CblasColMajor, num_rows, num_cols, + Mdata, stride, pivot); +} + +inline void clapack_Xgetrf(KaldiBlasInt num_rows, KaldiBlasInt num_cols, + double *Mdata, KaldiBlasInt stride, + KaldiBlasInt *pivot, KaldiBlasInt *result) { + *result = clapack_dgetrf(CblasColMajor, num_rows, num_cols, + Mdata, stride, pivot); +} +// +inline KaldiBlasInt clapack_Xtrtri(KaldiBlasInt num_rows, float *Mdata, KaldiBlasInt stride) { + return clapack_strtri(CblasColMajor, CblasUpper, CblasNonUnit, num_rows, + Mdata, stride); +} + +inline KaldiBlasInt clapack_Xtrtri(KaldiBlasInt num_rows, double *Mdata, KaldiBlasInt stride) { + return clapack_dtrtri(CblasColMajor, CblasUpper, CblasNonUnit, num_rows, + Mdata, stride); +} +// +inline void clapack_Xgetri(KaldiBlasInt num_rows, float *Mdata, KaldiBlasInt stride, + KaldiBlasInt *pivot, KaldiBlasInt *result) { + *result = clapack_sgetri(CblasColMajor, num_rows, Mdata, stride, pivot); +} +inline void clapack_Xgetri(KaldiBlasInt num_rows, double *Mdata, KaldiBlasInt stride, + KaldiBlasInt *pivot, KaldiBlasInt *result) { + *result = clapack_dgetri(CblasColMajor, num_rows, Mdata, stride, pivot); +} +#endif + +} +// namespace kaldi + +#endif diff --git a/src/matrix/kaldi-blas.h b/src/cblasext/kaldi-blas.h similarity index 96% rename from src/matrix/kaldi-blas.h rename to src/cblasext/kaldi-blas.h index 8a06540bba2..88ba12a0be1 100644 --- a/src/matrix/kaldi-blas.h +++ b/src/cblasext/kaldi-blas.h @@ -122,10 +122,8 @@ typedef integer KaldiBlasInt; #ifdef HAVE_MKL typedef MKL_INT KaldiBlasInt; #endif - #ifdef HAVE_ATLAS -// in this case there is no need for KaldiBlasInt-- this typedef is only needed -// for Svd code which is not included in ATLAS (we re-implement it). +typedef int KaldiBlasInt; #endif diff --git a/src/chain/Makefile b/src/chain/Makefile index fbad28f7de6..dd4859f5449 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -18,7 +18,7 @@ LIBNAME = kaldi-chain ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../lat/kaldi-lat.a \ ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ - ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a # Make sure we have CUDA_ARCH from kaldi.mk, ifeq ($(CUDA), true) diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc index 11c851091bd..36e82c1baf6 100644 --- a/src/chain/chain-den-graph.cc +++ b/src/chain/chain-den-graph.cc @@ -162,7 +162,7 @@ void DenominatorGraph::GetNormalizationFst(const fst::StdVectorFst &ifst, } -void MapFstToPdfIdsPlusOne(const TransitionModel &trans_model, +void MapFstToPdfIdsPlusOne(const Transitions &trans_model, fst::StdVectorFst *fst) { int32 num_states = fst->NumStates(); for (int32 s = 0; s < num_states; s++) { @@ -171,7 +171,7 @@ void MapFstToPdfIdsPlusOne(const TransitionModel &trans_model, fst::StdArc arc = aiter.Value(); KALDI_ASSERT(arc.ilabel == arc.olabel); if (arc.ilabel > 0) { - arc.ilabel = trans_model.TransitionIdToPdf(arc.ilabel) + 1; + arc.ilabel = trans_model.TransitionIdToPdfFast(arc.ilabel) + 1; arc.olabel = arc.ilabel; aiter.SetValue(arc); } @@ -295,7 +295,7 @@ static void CheckDenominatorFst(int32 num_pdfs, } void CreateDenominatorFst(const ContextDependency &ctx_dep, - const TransitionModel &trans_model, + const Transitions &trans_model, const fst::StdVectorFst &phone_lm_in, fst::StdVectorFst *den_fst) { using fst::StdVectorFst; @@ -336,31 +336,26 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep, << 
context_dep_lm.NumStates() << " and " << NumArcs(context_dep_lm); std::vector disambig_syms_h; // disambiguation symbols on input side - // of H -- will be empty. + // of H -- will be empty. + HTransducerConfig h_config; - // the default is 1, but just document that we want this to stay as one. - // we'll use the same value in test time. Consistency is the key here. - h_config.transition_scale = 1.0; - - StdVectorFst *h_fst = GetHTransducer(inv_cfst.IlabelInfo(), - ctx_dep, - trans_model, - h_config, - &disambig_syms_h); + std::unique_ptr h_fst = GetHTransducer(inv_cfst.IlabelInfo(), + ctx_dep, + trans_model, + h_config, + &disambig_syms_h); KALDI_ASSERT(disambig_syms_h.empty()); StdVectorFst transition_id_fst; TableCompose(*h_fst, context_dep_lm, &transition_id_fst); - delete h_fst; - BaseFloat self_loop_scale = 1.0; // We have to be careful to use the same - // value in test time. // 'reorder' must always be set to true for chain models. - bool reorder = true; - bool check_no_self_loops = true; + bool currently_self_loop_free = true, + use_weights = true; // add self-loops to the FST with transition-ids as its labels. - AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale, reorder, - check_no_self_loops, &transition_id_fst); + AddSelfLoops(trans_model, disambig_syms_h, + currently_self_loop_free, use_weights, + &transition_id_fst); // at this point transition_id_fst will have transition-ids as its ilabels and // context-dependent phones (indexes into IlabelInfo()) as its olabels. // Discard the context-dependent phones by projecting on the input, keeping diff --git a/src/chain/chain-den-graph.h b/src/chain/chain-den-graph.h index b2510651f39..baf5ac2c6f1 100644 --- a/src/chain/chain-den-graph.h +++ b/src/chain/chain-den-graph.h @@ -32,7 +32,7 @@ #include "lat/kaldi-lattice.h" #include "matrix/kaldi-matrix.h" #include "chain/chain-datastruct.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-vector.h" #include "cudamatrix/cu-array.h" @@ -149,7 +149,7 @@ void MinimizeAcceptorNoPush(fst::StdVectorFst *fst); // transition-ids to pdf-ids plus one. Assumes 'fst' // is an acceptor, but does not check this (only looks at its // ilabels). -void MapFstToPdfIdsPlusOne(const TransitionModel &trans_model, +void MapFstToPdfIdsPlusOne(const Transitions &trans_model, fst::StdVectorFst *fst); // Starting from an acceptor on phones that represents some kind of compiled @@ -157,7 +157,7 @@ void MapFstToPdfIdsPlusOne(const TransitionModel &trans_model, // denominator-graph. Note: there is similar code in chain-supervision.cc, when // creating the supervision graph. 
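The CreateDenominatorFst() hunk above shows the two graph-building API changes that recur through the rest of this patch: GetHTransducer() now returns a smart pointer rather than a raw FST the caller must delete, and AddSelfLoops() takes two booleans (whether the graph is currently self-loop-free, and whether to apply weights) in place of the old self_loop_scale/reorder/check_no_self_loops arguments. A sketch of the new call pattern follows; the pointee type of the unique_ptr is an assumption (the template argument is elided in the hunk) and the wrapper function itself is illustrative only. The matching declaration changes in chain-den-graph.h continue just below.

#include <memory>
#include <vector>
#include "hmm/hmm-utils.h"
#include "hmm/transitions.h"
#include "fstext/fstext-lib.h"
#include "tree/context-dep.h"

namespace kaldi {
void BuildTransitionIdFst(const ContextDependency &ctx_dep,
                          const Transitions &trans_model,
                          const std::vector<std::vector<int32> > &ilabel_info,
                          const fst::StdVectorFst &context_dep_lm,
                          fst::StdVectorFst *transition_id_fst) {
  std::vector<int32> disambig_syms_h;  // stays empty for the denominator graph.
  HTransducerConfig h_config;
  std::unique_ptr<fst::StdVectorFst> h_fst =
      GetHTransducer(ilabel_info, ctx_dep, trans_model, h_config,
                     &disambig_syms_h);  // no manual 'delete h_fst' any more.
  TableCompose(*h_fst, context_dep_lm, transition_id_fst);
  bool currently_self_loop_free = true,  // H was built without self-loops here,
       use_weights = true;               // and we do want transition weights.
  AddSelfLoops(trans_model, disambig_syms_h,
               currently_self_loop_free, use_weights, transition_id_fst);
}
}  // namespace kaldi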
void CreateDenominatorFst(const ContextDependency &ctx_dep, - const TransitionModel &trans_model, + const Transitions &trans_model, const fst::StdVectorFst &phone_lm, fst::StdVectorFst *den_graph); diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h index 217b7447621..68e6e32682d 100644 --- a/src/chain/chain-denominator.h +++ b/src/chain/chain-denominator.h @@ -31,7 +31,7 @@ #include "tree/context-dep.h" #include "lat/kaldi-lattice.h" #include "matrix/kaldi-matrix.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-array.h" #include "chain/chain-den-graph.h" diff --git a/src/chain/chain-generic-numerator.h b/src/chain/chain-generic-numerator.h index fc5e00b2c63..8c542d6049c 100644 --- a/src/chain/chain-generic-numerator.h +++ b/src/chain/chain-generic-numerator.h @@ -32,7 +32,7 @@ #include "tree/context-dep.h" #include "lat/kaldi-lattice.h" #include "matrix/kaldi-matrix.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "chain/chain-supervision.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-array.h" diff --git a/src/chain/chain-numerator.h b/src/chain/chain-numerator.h index 15cb31e0571..c4ea4774b53 100644 --- a/src/chain/chain-numerator.h +++ b/src/chain/chain-numerator.h @@ -31,7 +31,7 @@ #include "tree/context-dep.h" #include "lat/kaldi-lattice.h" #include "matrix/kaldi-matrix.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "chain/chain-supervision.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-array.h" diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index 7ee5ee117b0..8af77af5d12 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -57,7 +57,7 @@ void ComputeExamplePhoneLanguageModel(const std::vector &phones, void ComputeExampleDenFst(const ContextDependency &ctx_dep, - const TransitionModel &trans_model, + const Transitions &trans_model, fst::StdVectorFst *den_graph) { using fst::StdVectorFst; using fst::StdArc; @@ -151,7 +151,7 @@ void TestSupervisionNumerator(const Supervision &supervision) { } -void TestSupervisionAppend(const TransitionModel &trans_model, +void TestSupervisionAppend(const Transitions &trans_model, const Supervision &supervision) { int32 num_append = RandInt(1,5); std::vector input(num_append); @@ -180,7 +180,7 @@ void TestSupervisionAppend(const TransitionModel &trans_model, output.Check(trans_model); } -void TestSupervisionReattached(const TransitionModel &trans_model, +void TestSupervisionReattached(const Transitions &trans_model, const Supervision &supervision, const Supervision &reattached_supervision) { using namespace fst; @@ -333,7 +333,7 @@ void ChainTrainingTest(const DenominatorGraph &den_graph, } void TestSupervisionSplitting(const ContextDependency &ctx_dep, - const TransitionModel &trans_model, + const Transitions &trans_model, const Supervision &supervision) { fst::StdVectorFst den_fst, normalization_fst; ComputeExampleDenFst(ctx_dep, trans_model, &den_fst); @@ -456,7 +456,7 @@ void ChainDenominatorTest(const DenominatorGraph &den_graph) { void ChainSupervisionTest() { ContextDependency *ctx_dep; - TransitionModel *trans_model = GenRandTransitionModel(&ctx_dep); + Transitions *trans_model = GenRandTransitions(&ctx_dep); const std::vector &phones = trans_model->GetPhones(); int32 subsample_factor = RandInt(1, 3); diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 
f8a2c1d11cc..a99592aa403 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -21,6 +21,7 @@ #include "lat/lattice-functions.h" #include "util/text-utils.h" #include "hmm/hmm-utils.h" +#include "fstext/fstext-utils.h" #include namespace kaldi { @@ -229,7 +230,7 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) { // the following call will do the range-check on 'ilabel'. - int32 phone = trans_model_.TransitionIdToPhone(ilabel); + int32 phone = trans_model_.InfoForTransitionId(ilabel).phone; KALDI_ASSERT(static_cast(s) <= allowed_phones_.size()); if (static_cast(s) == allowed_phones_.size()) { // No arcs come from the final state.a @@ -240,7 +241,7 @@ bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) { oarc->ilabel = ilabel; if (convert_to_pdfs_) { // the olabel will be a pdf-id plus one, not a transition-id. - int32 pdf_id = trans_model_.TransitionIdToPdf(ilabel); + int32 pdf_id = trans_model_.TransitionIdToPdfFast(ilabel); oarc->olabel = pdf_id + 1; } else { oarc->olabel = ilabel; @@ -255,7 +256,7 @@ bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) { bool TrainingGraphToSupervisionE2e( const fst::StdVectorFst &training_graph, - const TransitionModel &trans_model, + const Transitions &trans_model, int32 num_frames, Supervision *supervision) { using fst::VectorFst; @@ -276,7 +277,7 @@ bool TrainingGraphToSupervisionE2e( } KALDI_ASSERT(arc.ilabel != 0); StdArc arc2(arc); - arc2.ilabel = arc2.olabel = trans_model.TransitionIdToPdf(arc.ilabel) + 1; + arc2.ilabel = arc2.olabel = trans_model.TransitionIdToPdfFast(arc.ilabel) + 1; aiter.SetValue(arc2); } } @@ -292,7 +293,7 @@ bool TrainingGraphToSupervisionE2e( bool ProtoSupervisionToSupervision( const ContextDependencyInterface &ctx_dep, - const TransitionModel &trans_model, + const Transitions &trans_model, const ProtoSupervision &proto_supervision, bool convert_to_pdfs, Supervision *supervision) { @@ -332,39 +333,27 @@ bool ProtoSupervisionToSupervision( // disambiguation symbols on the output. HTransducerConfig h_cfg; - - // We don't want to add any transition probabilities as they will be added - // when we compose with the denominator graph. - h_cfg.transition_scale = 0.0; - - VectorFst *h_fst = GetHTransducer(inv_cfst.IlabelInfo(), - ctx_dep, - trans_model, - h_cfg, - &disambig_syms_h); + h_cfg.include_self_loops = true; + std::unique_ptr> h_fst = GetHTransducer(inv_cfst.IlabelInfo(), + ctx_dep, + trans_model, + h_cfg, + &disambig_syms_h); KALDI_ASSERT(disambig_syms_h.empty()); + // We don't want to include any transition probabilities as they will be added + // when we compose with the normalization FST. + fst::RemoveWeights(h_fst.get()); + VectorFst transition_id_fst; TableCompose(*h_fst, context_dep_fst, &transition_id_fst); - delete h_fst; - - // We don't want to add any transition probabilities as they will be added - // when we compose with the denominator graph. - BaseFloat self_loop_scale = 0.0; - - // You should always set reorder to true; for the current chain-model - // topologies, it will affect results if you are inconsistent about this. - bool reorder = true, - check_no_self_loops = true; - // add self-loops to the FST with transition-ids as its labels. 
- AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale, reorder, - check_no_self_loops, &transition_id_fst); // at this point transition_id_fst will have transition-ids as its ilabels and // context-dependent phones (indexes into ILabelInfo()) as its olabels. // Discard the context-dependent phones by projecting on the input, keeping // only the transition-ids. fst::Project(&transition_id_fst, fst::PROJECT_INPUT); + if (transition_id_fst.Properties(fst::kIEpsilons, true) != 0) { // remove epsilons, if there are any. fst::RmEpsilon(&transition_id_fst); @@ -906,7 +895,7 @@ bool Supervision::operator == (const Supervision &other) const { label_dim == other.label_dim && fst::Equal(fst, other.fst); } -void Supervision::Check(const TransitionModel &trans_mdl) const { +void Supervision::Check(const Transitions &trans_mdl) const { if (weight <= 0.0) KALDI_ERR << "Weight should be positive."; if (frames_per_sequence <= 0) @@ -970,7 +959,7 @@ void GetWeightsForRanges(int32 range_length, } bool ConvertSupervisionToUnconstrained( - const TransitionModel &trans_mdl, + const Transitions &trans_mdl, Supervision *supervision) { KALDI_ASSERT(supervision->label_dim == trans_mdl.NumTransitionIds() && supervision->fst.NumStates() > 0 && @@ -1000,7 +989,7 @@ bool ConvertSupervisionToUnconstrained( } for (int32 i = 0; i < supervision->frames_per_sequence; i++) { supervision->alignment_pdfs[i] = - trans_mdl.TransitionIdToPdf(supervision->alignment_pdfs[i]); + trans_mdl.TransitionIdToPdfFast(supervision->alignment_pdfs[i]); } } @@ -1027,7 +1016,7 @@ bool ConvertSupervisionToUnconstrained( // because these graphs are always built with reorder == true; if it was // built with reorder == false, we'd have to treat the last, not first, // frame specially.) - if (trans_mdl.IsSelfLoop(transition_id) && s != start_state) + if (trans_mdl.InfoForTransitionId(transition_id).is_self_loop && s != start_state) arc.ilabel = 0; aiter.SetValue(arc); } @@ -1062,19 +1051,18 @@ bool ConvertSupervisionToUnconstrained( // There are be no disambiguation symbols here. std::vector disambig_syms; - // We're not adding transition probabilities; we rely on compsition with the + // We're not adding transition probabilities; we rely on composition with the // normalization FST for that. (note: all transition probabilities are just // 0.5 anyway, for the typical chain topology). - BaseFloat self_loop_scale = 0.0; - // 'reorder' must always be true for chain models. - bool reorder = true; - // The FST we're about to call AddSelfLoops() on will have self-loops, on - // the first frame, so disable the check that the FST was originally - // self-loop-free. - bool check_no_self_loops = false; + // + // The FST we're about to call AddSelfLoops() on will already have one + // self-loop, on the first frame, so tell that to AddSelfLoops(). + bool currently_self_loop_free = false, + use_weights = false; supervision->e2e_fsts.resize(1); - AddSelfLoops(trans_mdl, disambig_syms, self_loop_scale, - reorder, check_no_self_loops, &(supervision->e2e_fsts[0])); + AddSelfLoops(trans_mdl, disambig_syms, + currently_self_loop_free, use_weights, + &(supervision->e2e_fsts[0])); } { // Convert transition-ids to pdf-ids+1 on the FST labels, @@ -1089,7 +1077,7 @@ bool ConvertSupervisionToUnconstrained( // AddSelfLoops() works (it calls MakePrecedingInputSymbolsSame(), which // adds epsilons). zero olabels. 
if (arc.ilabel != 0) { - int32 pdf_id_plus_one = trans_mdl.TransitionIdToPdf(arc.ilabel) + 1; + int32 pdf_id_plus_one = trans_mdl.TransitionIdToPdfFast(arc.ilabel) + 1; arc.ilabel = pdf_id_plus_one; arc.olabel = pdf_id_plus_one; aiter.SetValue(arc); diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index f1a796dc2f8..0b8a760f1e6 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -29,7 +29,7 @@ #include "util/common-utils.h" #include "lat/kaldi-lattice.h" #include "fstext/deterministic-fst.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" namespace kaldi { namespace chain { @@ -181,7 +181,7 @@ class TimeEnforcerFst: typedef fst::StdArc::StateId StateId; typedef fst::StdArc::Label Label; - TimeEnforcerFst(const TransitionModel &trans_model, + TimeEnforcerFst(const Transitions &trans_model, bool convert_to_pdfs, const std::vector > &allowed_phones): trans_model_(trans_model), @@ -204,7 +204,7 @@ class TimeEnforcerFst: virtual bool GetArc(StateId s, Label ilabel, fst::StdArc* oarc); private: - const TransitionModel &trans_model_; + const Transitions &trans_model_; // if convert_to_pdfs_ is true, this FST will map from transition-id (on the // input side) to pdf-id plus one (on the output side); if false, both sides' // labels will be transition-id. @@ -234,10 +234,10 @@ struct Supervision { // the maximum possible value of the labels in 'fst' (which go from 1 to // label_dim). For fully-processed examples this will equal the NumPdfs() in the - // TransitionModel object, but for newer-style "unconstrained" examples + // Transitions object, but for newer-style "unconstrained" examples // that have been output by chain-get-supervision but not yet processed // by nnet3-chain-get-egs, it will be the NumTransitionIds() of the - // TransitionModel object. + // Transitions object. int32 label_dim; // This is an epsilon-free unweighted acceptor that is sorted in increasing @@ -297,7 +297,7 @@ struct Supervision { // This function checks that this supervision object satifsies some // of the properties we expect of it, and calls KALDI_ERR if not. - void Check(const TransitionModel &trans_model) const; + void Check(const Transitions &trans_model) const; void Write(std::ostream &os, bool binary) const; void Read(std::istream &is, bool binary); @@ -317,7 +317,7 @@ struct Supervision { */ bool ProtoSupervisionToSupervision( const ContextDependencyInterface &ctx_dep, - const TransitionModel &trans_model, + const Transitions &trans_model, const ProtoSupervision &proto_supervision, bool convert_to_pdfs, Supervision *supervision); @@ -333,7 +333,7 @@ bool ProtoSupervisionToSupervision( */ bool TrainingGraphToSupervisionE2e( const fst::StdVectorFst& training_graph, - const TransitionModel& trans_model, + const Transitions& trans_model, int32 num_frames, Supervision *supervision); @@ -484,7 +484,7 @@ void GetWeightsForRanges(int32 range_length, /// It returns true on success, and false if some kind of error happened /// (this is not expected). 
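Several of the hunks above (MapFstToPdfIdsPlusOne, TrainingGraphToSupervisionE2e, and the unconstrained-supervision code) use the same relabeling idiom: every non-epsilon transition-id label becomes its pdf-id plus one, keeping label 0 free for epsilon. A standalone sketch of that idiom, mirroring MapFstToPdfIdsPlusOne from the hunk above but under an illustrative name; the header declaration changes continue below.

#include "fst/fstlib.h"
#include "hmm/transitions.h"

namespace kaldi {
void MapLabelsToPdfIdsPlusOne(const Transitions &trans_model,
                              fst::StdVectorFst *fst) {
  for (fst::StateIterator<fst::StdVectorFst> siter(*fst);
       !siter.Done(); siter.Next()) {
    for (fst::MutableArcIterator<fst::StdVectorFst> aiter(fst, siter.Value());
         !aiter.Done(); aiter.Next()) {
      fst::StdArc arc = aiter.Value();
      if (arc.ilabel > 0) {  // leave epsilons untouched.
        arc.ilabel = trans_model.TransitionIdToPdfFast(arc.ilabel) + 1;
        arc.olabel = arc.ilabel;  // these graphs are acceptors.
        aiter.SetValue(arc);
      }
    }
  }
}
}  // namespace kaldi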
bool ConvertSupervisionToUnconstrained( - const TransitionModel &trans_mdl, + const Transitions &trans_mdl, Supervision *supervision); diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 3e7efbb59a1..cd243ff06ba 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -31,7 +31,7 @@ #include "tree/context-dep.h" #include "lat/kaldi-lattice.h" #include "matrix/kaldi-matrix.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "chain/chain-den-graph.h" #include "chain/chain-supervision.h" diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile index 41ac7342d17..519c2bbf77d 100644 --- a/src/chainbin/Makefile +++ b/src/chainbin/Makefile @@ -25,7 +25,7 @@ ADDLIBS = ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \ ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \ ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \ ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/chainbin/chain-get-supervision.cc b/src/chainbin/chain-get-supervision.cc index 1ac89d4630b..8a4904843be 100644 --- a/src/chainbin/chain-get-supervision.cc +++ b/src/chainbin/chain-get-supervision.cc @@ -30,7 +30,7 @@ namespace chain { // This wrapper function does all the job of processing the features and // lattice into ChainSupervision objects, and writing them out. -static bool ProcessSupervision(const TransitionModel &trans_model, +static bool ProcessSupervision(const Transitions &trans_model, const ContextDependencyInterface &ctx_dep, const ProtoSupervision &proto_sup, const std::string &key, @@ -97,7 +97,7 @@ int main(int argc, char *argv[]) { phone_durs_or_lat_rspecifier = po.GetArg(3), supervision_wspecifier = po.GetArg(4); - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(trans_model_rxfilename, &trans_model); ContextDependency ctx_dep; diff --git a/src/chainbin/chain-make-den-fst.cc b/src/chainbin/chain-make-den-fst.cc index 0d8d249242b..dc2b41a369d 100644 --- a/src/chainbin/chain-make-den-fst.cc +++ b/src/chainbin/chain-make-den-fst.cc @@ -56,7 +56,7 @@ int main(int argc, char *argv[]) { ContextDependency ctx_dep; - TransitionModel trans_model; + Transitions trans_model; fst::StdVectorFst phone_lm; ReadKaldiObject(tree_rxfilename, &ctx_dep); diff --git a/src/chainbin/nnet3-chain-acc-lda-stats.cc b/src/chainbin/nnet3-chain-acc-lda-stats.cc index 693eb2dad86..0cf2d449d76 100644 --- a/src/chainbin/nnet3-chain-acc-lda-stats.cc +++ b/src/chainbin/nnet3-chain-acc-lda-stats.cc @@ -19,7 +19,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "lat/lattice-functions.h" #include "nnet3/nnet-nnet.h" #include "nnet3/nnet-chain-example.h" diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 0117fe2200f..46744b239d0 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "nnet3/nnet-chain-example.h" namespace kaldi { diff --git a/src/chainbin/nnet3-chain-e2e-get-egs.cc b/src/chainbin/nnet3-chain-e2e-get-egs.cc index 8cdda8deb32..31b14cb7b0f 100644 --- 
a/src/chainbin/nnet3-chain-e2e-get-egs.cc +++ b/src/chainbin/nnet3-chain-e2e-get-egs.cc @@ -22,7 +22,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "hmm/posterior.h" #include "nnet3/nnet-example.h" @@ -74,7 +74,7 @@ static int32 FindMinimumLengthPath( */ static bool ProcessFile(const ExampleGenerationConfig &opts, - const TransitionModel &trans_model, + const Transitions &trans_model, const fst::StdVectorFst &normalization_fst, const MatrixBase &feats, const MatrixBase *ivector_feats, @@ -285,7 +285,7 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(normalization_fst.NumStates() > 0); } - TransitionModel trans_model; + Transitions trans_model; ReadKaldiObject(trans_model_rxfilename, &trans_model); RandomAccessBaseFloatMatrixReader feat_reader(feature_rspecifier); diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index 1032b7e2125..2c506c5b460 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/posterior.h" #include "nnet3/nnet-example.h" #include "nnet3/nnet-chain-example.h" @@ -86,7 +86,7 @@ namespace nnet3 { **/ -static bool ProcessFile(const TransitionModel *trans_mdl, +static bool ProcessFile(const Transitions *trans_mdl, const fst::StdVectorFst &normalization_fst, const GeneralMatrix &feats, const MatrixBase *ivector_feats, @@ -345,8 +345,8 @@ int main(int argc, char *argv[]) { UtteranceSplitter utt_splitter(eg_config); - const TransitionModel *trans_mdl_ptr = NULL; - TransitionModel trans_mdl; + const Transitions *trans_mdl_ptr = NULL; + Transitions trans_mdl; if (!trans_mdl_rxfilename.empty()) { ReadKaldiObject(trans_mdl_rxfilename, &trans_mdl); diff --git a/src/chainbin/nnet3-chain-merge-egs.cc b/src/chainbin/nnet3-chain-merge-egs.cc index a3686d2fc30..14bdbe55115 100644 --- a/src/chainbin/nnet3-chain-merge-egs.cc +++ b/src/chainbin/nnet3-chain-merge-egs.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "nnet3/nnet-chain-example.h" diff --git a/src/chainbin/nnet3-chain-normalize-egs.cc b/src/chainbin/nnet3-chain-normalize-egs.cc index a97797e3246..70f6852e963 100644 --- a/src/chainbin/nnet3-chain-normalize-egs.cc +++ b/src/chainbin/nnet3-chain-normalize-egs.cc @@ -19,7 +19,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "nnet3/nnet-chain-example.h" #include "chain/chain-supervision.h" diff --git a/src/chainbin/nnet3-chain-shuffle-egs.cc b/src/chainbin/nnet3-chain-shuffle-egs.cc index 7ab6e28f607..94ba30799b0 100644 --- a/src/chainbin/nnet3-chain-shuffle-egs.cc +++ b/src/chainbin/nnet3-chain-shuffle-egs.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "nnet3/nnet-chain-example.h" int main(int argc, char *argv[]) { diff --git a/src/configure b/src/configure index e6ffdf337af..c727948962e 100755 --- a/src/configure +++ b/src/configure @@ -502,8 +502,8 @@ function configure_cuda { echo CUDA = true >> kaldi.mk echo CUDATKDIR = $CUDATKDIR >> kaldi.mk echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk - - + + echo >> kaldi.mk # 64bit/32bit? 
We do not support cross compilation with CUDA so, use direct @@ -524,7 +524,7 @@ WARNING: CUDA will not be used! CUDA is not supported with 32-bit builds." exit 1; fi - + #add cusolver flags for newer toolkits if [ "$CUSOLVER" == "true" ]; then echo "CUDA_LDLIBS += -lcusolver" >> kaldi.mk @@ -1346,6 +1346,9 @@ if [ -n "$ENV_CXXFLAGS" ]; then echo "CXXFLAGS += $ENV_CXXFLAGS" >> kaldi.mk; fi if [ -n "$ENV_LDFLAGS" ]; then echo "LDFLAGS += $ENV_LDFLAGS" >> kaldi.mk; fi if [ -n "$ENV_LDLIBS" ]; then echo "LDLIBS += $ENV_LDLIBS" >> kaldi.mk; fi +echo "# The following makes it possible to include as kaldi/foo/bar.h" >> kaldi.mk +echo "CXXFLAGS += -I ../.." >> kaldi.mk + # We check for slow exp implementation just before we exit. This check uses # and possibly modifies the kaldi.mk file that we just generated. check_for_slow_expf; diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index 166f72e060f..6b3f4129a18 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -20,8 +20,8 @@ LDLIBS += $(CUDA_LDLIBS) LIBNAME = kaldi-cudadecoder -ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../base/kaldi-base.a ../matrix/kaldi-matrix.a \ - ../lat/kaldi-lat.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../gmm/kaldi-gmm.a \ +ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../base/kaldi-base.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \ + ../lat/kaldi-lat.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../gmm/kaldi-gmm.a \ ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../gmm/kaldi-gmm.a ../transform/kaldi-transform.a \ ../tree/kaldi-tree.a ../online2/kaldi-online2.a ../nnet3/kaldi-nnet3.a \ ../cudafeat/kaldi-cudafeat.a diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc index d3ad909d80a..0007234016d 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc @@ -28,7 +28,7 @@ namespace cuda_decoder { void BatchedThreadedNnet3CudaPipeline::Initialize( const fst::Fst &decode_fst, const nnet3::AmNnetSimple &am_nnet, - const TransitionModel &trans_model) { + const Transitions &trans_model) { KALDI_LOG << "BatchedThreadedNnet3CudaPipeline Initialize with " << config_.num_control_threads << " control threads, " << config_.num_worker_threads << " worker threads" diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h index 6401b24b7db..79bc6d69de6 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h @@ -343,7 +343,7 @@ class BatchedThreadedNnet3CudaPipeline { BatchedThreadedNnet3CudaPipelineConfig config_; CudaFst cuda_fst_; - const TransitionModel *trans_model_; + const Transitions *trans_model_; const nnet3::AmNnetSimple *am_nnet_; nnet3::DecodableNnetSimpleLoopedInfo *decodable_info_; OnlineNnet2FeaturePipelineInfo *feature_info_; diff --git a/src/cudadecoder/cuda-fst.cc b/src/cudadecoder/cuda-fst.cc index 6f899d87321..70f745f286a 100644 --- a/src/cudadecoder/cuda-fst.cc +++ b/src/cudadecoder/cuda-fst.cc @@ -113,21 +113,21 @@ void CudaFst::PopulateArcs(const fst::Fst &fst) { h_arc_id_ilabels_[idx] = arc.ilabel; // For now we consider id indexing == pdf indexing // If the two are differents, we'll call ApplyTransModelOnIlabels with a - // TransitionModel + // Transitions h_arc_pdf_ilabels_[idx] = arc.ilabel; h_arc_olabels_[idx] = arc.olabel; } } } -void 
CudaFst::ApplyTransitionModelOnIlabels( - const TransitionModel &trans_model) { +void CudaFst::ApplyTransitionsOnIlabels( + const Transitions &trans_model) { // Converting ilabel here, to avoid reindexing when reading nnet3 output // We only need to convert the emitting arcs // The emitting arcs are the first e_count_ arcs for (int iarc = 0; iarc < e_count_; ++iarc) h_arc_pdf_ilabels_[iarc] = - trans_model.TransitionIdToPdf(h_arc_id_ilabels_[iarc]); + trans_model.InfoForTransitionId(h_arc_id_ilabels_[iarc]).pdf_id; } void CudaFst::CopyDataToDevice() { @@ -153,7 +153,7 @@ void CudaFst::CopyDataToDevice() { } void CudaFst::Initialize(const fst::Fst &fst, - const TransitionModel *trans_model) { + const Transitions *trans_model) { nvtxRangePushA("CudaFst constructor"); start_ = fst.Start(); @@ -164,7 +164,7 @@ void CudaFst::Initialize(const fst::Fst &fst, // at the end of Initialize h_arc_pdf_ilabels_.resize(arc_count_); PopulateArcs(fst); - if (trans_model) ApplyTransitionModelOnIlabels(*trans_model); + if (trans_model) ApplyTransitionsOnIlabels(*trans_model); KALDI_ASSERT(d_e_offsets_); KALDI_ASSERT(d_ne_offsets_); diff --git a/src/cudadecoder/cuda-fst.h b/src/cudadecoder/cuda-fst.h index 1dac627755b..8c07bb4936d 100644 --- a/src/cudadecoder/cuda-fst.h +++ b/src/cudadecoder/cuda-fst.h @@ -20,7 +20,7 @@ #include "cudadecoder/cuda-decoder-common.h" #include "cudamatrix/cu-device.h" #include "lat/kaldi-lattice.h" -#include "nnet3/decodable-online-looped.h" // TransitionModel +#include "nnet3/decodable-online-looped.h" // Transitions namespace kaldi { namespace cuda_decoder { @@ -52,13 +52,13 @@ class CudaFst { d_final_(nullptr){}; // Creates a CSR representation of the FST, // then copies it to the GPU - // If a TransitionModel is passed, we'll use it to convert the ilabels id + // If a Transitions is passed, we'll use it to convert the ilabels id // indexes into pdf indexes - // If no TransitionModel is passed, we'll assume TransitionModel == identity - // Important: The CudaDecodable won't apply the TransitionModel. If you use a - // TransitionModel, you need to apply it now + // If no Transitions is passed, we'll assume Transitions == identity + // Important: The CudaDecodable won't apply the Transitions. 
If you use a + // Transitions, you need to apply it now void Initialize(const fst::Fst &fst, - const TransitionModel *trans_model = NULL); + const Transitions *trans_model = NULL); void Finalize(); inline uint32_t NumStates() const { return num_states_; } @@ -75,7 +75,7 @@ class CudaFst { // Converting the id ilabels into pdf ilabels using the transition model // It allows the CudaDecoder to read the acoustic model loglikelihoods at the // right indexes - void ApplyTransitionModelOnIlabels(const TransitionModel &trans_model); + void ApplyTransitionsOnIlabels(const Transitions &trans_model); // Copies fst to device into the pre-allocated datastructures void CopyDataToDevice(); // Total number of states diff --git a/src/cudadecoder/decodable-cumatrix.cc b/src/cudadecoder/decodable-cumatrix.cc index d7c1d0359a5..4704238852c 100644 --- a/src/cudadecoder/decodable-cumatrix.cc +++ b/src/cudadecoder/decodable-cumatrix.cc @@ -24,7 +24,7 @@ namespace kaldi { namespace cuda_decoder { DecodableCuMatrixMapped::DecodableCuMatrixMapped( - const TransitionModel &tm, const CuMatrixBase &likes, + const Transitions &tm, const CuMatrixBase &likes, int32 frame_offset) : trans_model_(tm), likes_(&likes), frame_offset_(frame_offset) { if (likes.NumCols() != tm.NumPdfs()) diff --git a/src/cudadecoder/decodable-cumatrix.h b/src/cudadecoder/decodable-cumatrix.h index d34079cc9c7..aaef4c9fd3f 100644 --- a/src/cudadecoder/decodable-cumatrix.h +++ b/src/cudadecoder/decodable-cumatrix.h @@ -35,7 +35,7 @@ class DecodableCuMatrixMapped : public CudaDecodableInterface { // This constructor creates an object that will not delete "likes" when done. // the frame_offset is the frame the row 0 of 'likes' corresponds to, would be // greater than one if this is not the first chunk of likelihoods. 
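The comment above defines the frame_offset convention for DecodableCuMatrixMapped: row 0 of 'likes' corresponds to frame_offset, which is nonzero when likelihoods arrive in chunks, and the constructor checks that the matrix has one column per pdf. A small standalone sketch of that chunked lookup (the class below is a toy, not the Kaldi decodable):

    // Sketch: chunked likelihood lookup with a frame offset.
    #include <cassert>
    #include <iostream>
    #include <vector>

    class ChunkedLoglikes {
     public:
      ChunkedLoglikes(const std::vector<std::vector<float>> &likes, int frame_offset)
          : likes_(likes), frame_offset_(frame_offset) {}

      // 'frame' is a global frame index; row 0 of likes_ holds frame_offset_.
      float LogLikelihood(int frame, int pdf_id) const {
        int row = frame - frame_offset_;
        assert(row >= 0 && row < static_cast<int>(likes_.size()));
        return likes_[row][pdf_id];
      }

     private:
      std::vector<std::vector<float>> likes_;
      int frame_offset_;
    };

    int main() {
      // Second chunk for a 3-pdf model: rows hold frames 100 and 101.
      ChunkedLoglikes chunk({{-1.0f, -2.0f, -0.5f}, {-0.3f, -1.7f, -2.2f}}, 100);
      std::cout << chunk.LogLikelihood(101, 0) << "\n";  // prints -0.3
      return 0;
    }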
- DecodableCuMatrixMapped(const TransitionModel &tm, + DecodableCuMatrixMapped(const Transitions &tm, const CuMatrixBase &likes, int32 frame_offset = 0); @@ -57,7 +57,7 @@ class DecodableCuMatrixMapped : public CudaDecodableInterface { virtual BaseFloat *GetLogLikelihoodsCudaPointer(int32 subsampled_frame); private: - const TransitionModel &trans_model_; // for tid to pdf mapping + const Transitions &trans_model_; // for tid to pdf mapping const CuMatrixBase *likes_; int32 frame_offset_; diff --git a/src/cudadecoderbin/Makefile b/src/cudadecoderbin/Makefile index 6a31a52ceca..b0867e75c7f 100644 --- a/src/cudadecoderbin/Makefile +++ b/src/cudadecoderbin/Makefile @@ -15,12 +15,12 @@ TESTFILES = ADDLIBS = ../cudadecoder/kaldi-cudadecoder.a ../cudafeat/kaldi-cudafeat.a \ ../online2/kaldi-online2.a ../ivector/kaldi-ivector.a \ -../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a ../nnet2/kaldi-nnet2.a \ +../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \ ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \ ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ ../feat/kaldi-feat.a ../transform/kaldi-transform.a \ ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ -../matrix/kaldi-matrix.a ../base/kaldi-base.a +../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a endif diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc index df6810ee2c8..10a58699ed6 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc @@ -169,7 +169,7 @@ int main(int argc, char *argv[]) { std::string nnet3_rxfilename = po.GetArg(1), fst_rxfilename = po.GetArg(2), wav_rspecifier = po.GetArg(3), clat_wspecifier = po.GetArg(4); - TransitionModel trans_model; + Transitions trans_model; nnet3::AmNnetSimple am_nnet; // read transition model and nnet diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index 33aca4eedaa..7dfe3c41cea 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -5,17 +5,17 @@ all: include ../kaldi.mk ifeq ($(CUDA), true) -TESTFILES = +TESTFILES = ifeq ($(CUDA), true) OBJFILES += feature-window-cuda.o feature-spectral-cuda.o feature-online-cmvn-cuda.o \ - online-ivector-feature-cuda-kernels.o online-ivector-feature-cuda.o \ - online-cuda-feature-pipeline.o + online-ivector-feature-cuda-kernels.o online-ivector-feature-cuda.o \ + online-cuda-feature-pipeline.o endif LIBNAME = kaldi-cudafeat -ADDLIBS = ../feat/kaldi-feat.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ +ADDLIBS = ../feat/kaldi-feat.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \ ../base/kaldi-base.a ../cudamatrix/kaldi-cudamatrix.a \ ../gmm/kaldi-gmm.a ../ivector/kaldi-ivector.a ../online2/kaldi-online2.a diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 8683372098c..ba9f8ebea0f 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -22,7 +22,6 @@ #include #endif -#include "cudafeat/feature-window-cuda.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-vector.h" #include "feat/feature-fbank.h" @@ -38,8 +37,8 @@ struct CudaSpectralFeatureOptions { SpectralFeatureType feature_type; CudaSpectralFeatureOptions(MfccOptions opts_in) : mfcc_opts(opts_in), - use_log_fbank(true), - use_power(true), + use_log_fbank(true), + use_power(true), use_dct(true), feature_type(MFCC) {} CudaSpectralFeatureOptions(FbankOptions opts){ @@ -75,13 +74,13 @@ class 
CudaSpectralFeatures : public MfccComputer { ~CudaSpectralFeatures(); CudaSpectralFeatureOptions cumfcc_opts_; int32 Dim() - // The dimension of the output is different for MFCC and Fbank. + // The dimension of the output is different for MFCC and Fbank. // This returns the appropriate value depending on the feature // extraction algorithm { if (cumfcc_opts_.feature_type == MFCC) return MfccComputer::Dim(); //If we're running fbank, we need to set the dimension right - else return cumfcc_opts_.mfcc_opts.mel_opts.num_bins + + else return cumfcc_opts_.mfcc_opts.mel_opts.num_bins + (cumfcc_opts_.mfcc_opts.use_energy ? 1 : 0); } diff --git a/src/cudafeat/feature-window-cuda.h b/src/cudafeat/feature-window-cuda.h deleted file mode 100644 index ff749a855b9..00000000000 --- a/src/cudafeat/feature-window-cuda.h +++ /dev/null @@ -1,38 +0,0 @@ -// cudafeat/feature-window-cuda.h -// -// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -// Justin Luitjens -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_CUDAFEAT_FEATURE_WINDOW_CUDA_H_ -#define KALDI_CUDAFEAT_FEATURE_WINDOW_CUDA_H_ - -#include "cudamatrix/cu-matrix.h" -#include "cudamatrix/cu-vector.h" -#include "feat/feature-window.h" - -namespace kaldi { - -// This struct stores a feature window on the device. -// Behind the scense it just computes a feature window on -// the host and then copies it into device memory. -struct CudaFeatureWindowFunction { - CudaFeatureWindowFunction() {} - explicit CudaFeatureWindowFunction(const FrameExtractionOptions &opts); - CuVector cu_window; -}; - -} // namespace kaldi - -#endif // KALDI_CUDAFEAT_FEATURE_WINDOW_CUDA_H_ diff --git a/src/cudafeatbin/Makefile b/src/cudafeatbin/Makefile index 105ece3c67f..b154623b1fb 100644 --- a/src/cudafeatbin/Makefile +++ b/src/cudafeatbin/Makefile @@ -22,6 +22,7 @@ ADDLIBS = ../cudafeat/kaldi-cudafeat.a ../online2/kaldi-online2.a \ ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../cblasext/kaldi-cblasext.a \ ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 45c2ba44fd7..5c0b4e7680c 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -18,7 +18,7 @@ endif LIBNAME = kaldi-cudamatrix -ADDLIBS = ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a +ADDLIBS = ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a # Make sure we have CUDA_ARCH from kaldi.mk, ifeq ($(CUDA), true) diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index c788a621a85..d285699edc7 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -26,7 +26,7 @@ // files in this directory. 
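The Dim() hunk above returns the MFCC dimension for MFCC features and num_bins plus an optional energy slot for fbank. A tiny sketch of that branch (the function and values below are illustrative):

    // Sketch of the output-dimension logic: MFCC uses the cepstral dimension,
    // fbank uses one value per mel bin plus an optional energy term.
    #include <iostream>

    enum FeatureType { MFCC, FBANK };

    int OutputDim(FeatureType type, int num_ceps, int num_bins, bool use_energy) {
      if (type == MFCC) return num_ceps;        // MfccComputer::Dim() equivalent
      return num_bins + (use_energy ? 1 : 0);   // fbank path
    }

    int main() {
      std::cout << OutputDim(MFCC, 13, 23, false) << "\n";  // 13
      std::cout << OutputDim(FBANK, 13, 40, true) << "\n";  // 41
      return 0;
    }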
#include #include "base/kaldi-common.h" -#include "matrix/kaldi-blas.h" +#include "cblasext/kaldi-blas.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-matrixdim.h" diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 21468ca9f63..514d129d56d 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -990,7 +990,7 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, } ssum[tid] = tsum; __syncthreads(); - + // Block reduce # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { @@ -1655,7 +1655,7 @@ static void _transform_reduce_mat_rows( Real tdata = op.InitValue(); for (int i = tid; i < d.rows; i += CU1DBLOCK) { //Note the loads of mat are uncoalesced. We could eliminate these - //with shared memory but at the matrix sizes we are currently looking + //with shared memory but at the matrix sizes we are currently looking //at it probably would not help much and would add a lot of complexity. //Alternatively we could look at something like trov to help loads. tdata = op.Reduce(tdata, op.Transform(mat[i * d.stride + j])); @@ -3618,7 +3618,7 @@ template __global__ void _cuda_mat_copy_range_clamped( int32_t row_start, int32_t row_end, int32_t num_cols, - const Real * __restrict__ src, int32_t lds, + const Real * __restrict__ src, int32_t lds, int32_t clamp_low, int32_t clamp_high, Real * __restrict__ dst, int32_t ldd) { int32_t rid = blockIdx.y*blockDim.y+threadIdx.y; @@ -3641,7 +3641,7 @@ void _cuda_mat_copy_range_clamped( } } -template +template struct MatrixCopyDesc { const Real *input; Real *output; @@ -3652,7 +3652,7 @@ struct MatrixCopyDesc { template struct BatchedMatrixCopyDesc { //maximum size allowed in formal parameter list - static const int32_t MAX_BATCH_SIZE=128; + static const int32_t MAX_BATCH_SIZE=128; MatrixCopyDesc batch[MAX_BATCH_SIZE]; }; @@ -3660,12 +3660,12 @@ struct BatchedMatrixCopyDesc { // grid dim x,y expands to fill out average in x/y across batches // grid dim.z is batch template -__global__ +__global__ void _cuda_batch_copy_mats(BatchedMatrixCopyDesc batch_desc) { int32_t rid = blockIdx.y * blockDim.y + threadIdx.y; int32_t cid = blockIdx.x * blockDim.x + threadIdx.x; - int32_t bid = blockIdx.z; // batch id + int32_t bid = blockIdx.z; // batch id // read copy parameters MatrixCopyDesc desc = batch_desc.batch[bid]; @@ -5466,7 +5466,7 @@ void cuda_legacy_noop() { void cudaF_mat_copy_range_clamped( int32_t row_start, int32_t row_end, int32_t num_cols, - const float *src, int32_t lds, + const float *src, int32_t lds, int32_t clamp_low, int32_t clamp_high, float *dst, int32_t ldd) { @@ -5480,7 +5480,7 @@ void cudaF_mat_copy_range_clamped( void cudaD_mat_copy_range_clamped( int32_t row_start, int32_t row_end, int32_t num_cols, - const double *src, int32_t lds, + const double *src, int32_t lds, int32_t clamp_low, int32_t clamp_high, double *dst, int32_t ldd) { @@ -5498,14 +5498,14 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, dim3 threads(32,32); int32_t total_rows=0, total_cols=0; - - BatchedMatrixCopyDesc batch_desc; + + BatchedMatrixCopyDesc batch_desc; const int32_t MAX_BATCH_SIZE=batch_desc.MAX_BATCH_SIZE; int i; for (i = 0; i < num_mats; i++) { int b = i%MAX_BATCH_SIZE; - + // fill in desc MatrixCopyDesc &desc = batch_desc.batch[b]; desc.num_rows = num_rows[i]; @@ -5523,12 +5523,12 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t rows = ceilf(total_rows / 
(float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); dim3 blocks((cols + 31) / 32, - (rows + 31) / 32, + (rows + 31) / 32, MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory - + // launch batch _cuda_batch_copy_mats<<>>(batch_desc); @@ -5544,9 +5544,9 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - + dim3 blocks((cols + 31) / 32, - (rows + 31) / 32, + (rows + 31) / 32, remaining); // no memcpy needed here. Memory will be passed down directly @@ -5563,14 +5563,14 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, dim3 threads(32,32); int32_t total_rows=0, total_cols=0; - - BatchedMatrixCopyDesc batch_desc; + + BatchedMatrixCopyDesc batch_desc; const int32_t MAX_BATCH_SIZE=batch_desc.MAX_BATCH_SIZE; int i; for (i = 0; i < num_mats; i++) { int b = i%MAX_BATCH_SIZE; - + // fill in desc MatrixCopyDesc &desc = batch_desc.batch[b]; desc.num_rows = num_rows[i]; @@ -5588,12 +5588,12 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); dim3 blocks((cols + 31) / 32, - (rows + 31) / 32, + (rows + 31) / 32, MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory - + // launch batch _cuda_batch_copy_mats<<>>(batch_desc); @@ -5611,9 +5611,9 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t cols = ceilf(total_cols / (float)remaining); dim3 blocks((cols + 31) / 32, - (rows + 31) / 32, + (rows + 31) / 32, remaining); - + // no memcpy needed here. 
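The cudaF_/cudaD_batched_copy_mats wrappers above pack one copy descriptor per matrix into a fixed-size struct of at most MAX_BATCH_SIZE entries, launch once per full batch, and issue one final launch for the remainder; the descriptors travel by value in the kernel's parameter list, so no explicit memcpy is needed. A host-side sketch of that chunking pattern, with a plain function standing in for the kernel launch (nothing below is CUDA or Kaldi API):

    // Sketch: batch work descriptors into fixed-size groups before launching.
    #include <iostream>
    #include <vector>

    struct CopyDesc { int num_rows, num_cols; };

    struct BatchedCopyDesc {
      static constexpr int kMaxBatchSize = 128;  // bounded by parameter-list size
      CopyDesc batch[kMaxBatchSize];
    };

    // Stand-in for the kernel launch: just reports how much work it was given.
    void LaunchBatch(const BatchedCopyDesc &desc, int count) {
      std::cout << "launch with " << count << " matrices\n";
    }

    void BatchedCopy(const std::vector<CopyDesc> &mats) {
      BatchedCopyDesc desc;
      int i = 0;
      for (; i < static_cast<int>(mats.size()); ++i) {
        int b = i % BatchedCopyDesc::kMaxBatchSize;
        desc.batch[b] = mats[i];
        if (b == BatchedCopyDesc::kMaxBatchSize - 1)   // batch full: launch it
          LaunchBatch(desc, BatchedCopyDesc::kMaxBatchSize);
      }
      int remaining = i % BatchedCopyDesc::kMaxBatchSize;
      if (remaining > 0)                               // leftover partial batch
        LaunchBatch(desc, remaining);                  // only 'remaining' entries valid
    }

    int main() {
      BatchedCopy(std::vector<CopyDesc>(300, CopyDesc{32, 64}));  // 128 + 128 + 44
      return 0;
    }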
Memory will be passed down directly // through paramter passing and live in constant memory diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 1df1626fc6d..a706b317cdd 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -1558,7 +1558,7 @@ inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, inline void cuda_mat_copy_range_clamped( int32_t row_start, int32_t row_end, int32_t num_cols, - const double *src, int32_t lds, + const double *src, int32_t lds, int32_t clamp_low, int32_t clamp_high, double *dst, int32_t ldd) { cudaD_mat_copy_range_clamped(row_start, row_end, num_cols, @@ -1567,7 +1567,7 @@ inline void cuda_mat_copy_range_clamped( inline void cuda_mat_copy_range_clamped( int32_t row_start, int32_t row_end, int32_t num_cols, - const float *src, int32_t lds, + const float *src, int32_t lds, int32_t clamp_low, int32_t clamp_high, float *dst, int32_t ldd) { cudaF_mat_copy_range_clamped(row_start, row_end, num_cols, @@ -1587,7 +1587,7 @@ inline void cuda_batched_copy_mats(int32_t num_mats, int32_t *num_rows, cudaD_batched_copy_mats(num_mats, num_rows, num_cols, inputs, ldi, outputs, ldo); } - + } // namespace kaldi diff --git a/src/decoder/Makefile b/src/decoder/Makefile index fbd8386f005..b74f45a38a4 100644 --- a/src/decoder/Makefile +++ b/src/decoder/Makefile @@ -13,7 +13,7 @@ LIBNAME = kaldi-decoder ADDLIBS = ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrixp/kaldi-matrix.a \ + ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/decoder/decodable-matrix.cc b/src/decoder/decodable-matrix.cc index 3cc7b87f2d7..98cd75d1ede 100644 --- a/src/decoder/decodable-matrix.cc +++ b/src/decoder/decodable-matrix.cc @@ -22,7 +22,7 @@ namespace kaldi { DecodableMatrixMapped::DecodableMatrixMapped( - const TransitionModel &tm, + const Transitions &tm, const MatrixBase &likes, int32 frame_offset): trans_model_(tm), likes_(&likes), likes_to_delete_(NULL), @@ -32,12 +32,12 @@ DecodableMatrixMapped::DecodableMatrixMapped( if (likes.NumCols() != tm.NumPdfs()) KALDI_ERR << "Mismatch, matrix has " - << likes.NumCols() << " rows but transition-model has " + << likes.NumCols() << " rows but transitions.has " << tm.NumPdfs() << " pdf-ids."; } DecodableMatrixMapped::DecodableMatrixMapped( - const TransitionModel &tm, const Matrix *likes, + const Transitions &tm, const Matrix *likes, int32 frame_offset): trans_model_(tm), likes_(likes), likes_to_delete_(likes), frame_offset_(frame_offset) { @@ -45,7 +45,7 @@ DecodableMatrixMapped::DecodableMatrixMapped( raw_data_ = likes->Data() - (stride_ * frame_offset_); if (likes->NumCols() != tm.NumPdfs()) KALDI_ERR << "Mismatch, matrix has " - << likes->NumCols() << " rows but transition-model has " + << likes->NumCols() << " rows but transitions.has " << tm.NumPdfs() << " pdf-ids."; } diff --git a/src/decoder/decodable-matrix.h b/src/decoder/decodable-matrix.h index 30b8b467c2e..c7d52c8ff10 100644 --- a/src/decoder/decodable-matrix.h +++ b/src/decoder/decodable-matrix.h @@ -24,7 +24,7 @@ #include #include "base/kaldi-common.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "itf/decodable-itf.h" #include "matrix/kaldi-matrix.h" @@ -34,26 +34,26 @@ namespace kaldi { class DecodableMatrixScaledMapped: public 
DecodableInterface { public: // This constructor creates an object that will not delete "likes" when done. - DecodableMatrixScaledMapped(const TransitionModel &tm, + DecodableMatrixScaledMapped(const Transitions &tm, const Matrix &likes, BaseFloat scale): trans_model_(tm), likes_(&likes), scale_(scale), delete_likes_(false) { if (likes.NumCols() != tm.NumPdfs()) KALDI_ERR << "DecodableMatrixScaledMapped: mismatch, matrix has " - << likes.NumCols() << " rows but transition-model has " + << likes.NumCols() << " rows but transitions.has " << tm.NumPdfs() << " pdf-ids."; } // This constructor creates an object that will delete "likes" // when done. - DecodableMatrixScaledMapped(const TransitionModel &tm, + DecodableMatrixScaledMapped(const Transitions &tm, BaseFloat scale, const Matrix *likes): trans_model_(tm), likes_(likes), scale_(scale), delete_likes_(true) { if (likes->NumCols() != tm.NumPdfs()) KALDI_ERR << "DecodableMatrixScaledMapped: mismatch, matrix has " - << likes->NumCols() << " rows but transition-model has " + << likes->NumCols() << " rows but transitions.has " << tm.NumPdfs() << " pdf-ids."; } @@ -76,7 +76,7 @@ class DecodableMatrixScaledMapped: public DecodableInterface { if (delete_likes_) delete likes_; } private: - const TransitionModel &trans_model_; // for tid to pdf mapping + const Transitions &trans_model_; // for tid to pdf mapping const Matrix *likes_; BaseFloat scale_; bool delete_likes_; @@ -100,13 +100,13 @@ class DecodableMatrixMapped: public DecodableInterface { // This constructor creates an object that will not delete "likes" when done. // the frame_offset is the frame the row 0 of 'likes' corresponds to, would be // greater than one if this is not the first chunk of likelihoods. - DecodableMatrixMapped(const TransitionModel &tm, + DecodableMatrixMapped(const Transitions &tm, const MatrixBase &likes, int32 frame_offset = 0); // This constructor creates an object that will delete "likes" // when done. - DecodableMatrixMapped(const TransitionModel &tm, + DecodableMatrixMapped(const Transitions &tm, const Matrix *likes, int32 frame_offset = 0); @@ -122,7 +122,7 @@ class DecodableMatrixMapped: public DecodableInterface { virtual ~DecodableMatrixMapped(); private: - const TransitionModel &trans_model_; // for tid to pdf mapping + const Transitions &trans_model_; // for tid to pdf mapping const MatrixBase *likes_; const Matrix *likes_to_delete_; int32 frame_offset_; @@ -151,7 +151,7 @@ class DecodableMatrixMapped: public DecodableInterface { */ class DecodableMatrixMappedOffset: public DecodableInterface { public: - DecodableMatrixMappedOffset(const TransitionModel &tm): + DecodableMatrixMappedOffset(const Transitions &tm): trans_model_(tm), frame_offset_(0), input_is_finished_(false) { } // this is not part of the generic Decodable interface. @@ -192,7 +192,7 @@ class DecodableMatrixMappedOffset: public DecodableInterface { // nothing special to do in destructor. 
virtual ~DecodableMatrixMappedOffset() { } private: - const TransitionModel &trans_model_; // for tid to pdf mapping + const Transitions &trans_model_; // for tid to pdf mapping Matrix loglikes_; int32 frame_offset_; bool input_is_finished_; diff --git a/src/decoder/decoder-wrappers.cc b/src/decoder/decoder-wrappers.cc index 588274e113b..b684b6bcda4 100644 --- a/src/decoder/decoder-wrappers.cc +++ b/src/decoder/decoder-wrappers.cc @@ -32,7 +32,7 @@ namespace kaldi { DecodeUtteranceLatticeFasterClass::DecodeUtteranceLatticeFasterClass( LatticeFasterDecoder *decoder, DecodableInterface *decodable, - const TransitionModel &trans_model, + const Transitions &trans_model, const fst::SymbolTable *word_syms, const std::string &utt, BaseFloat acoustic_scale, @@ -201,7 +201,7 @@ template bool DecodeUtteranceLatticeFaster( LatticeFasterDecoderTpl &decoder, // not const but is really an input. DecodableInterface &decodable, // not const but is really an input. - const TransitionModel &trans_model, + const Transitions &trans_model, const fst::SymbolTable *word_syms, std::string utt, double acoustic_scale, @@ -299,7 +299,7 @@ bool DecodeUtteranceLatticeFaster( template bool DecodeUtteranceLatticeFaster( LatticeFasterDecoderTpl > &decoder, DecodableInterface &decodable, - const TransitionModel &trans_model, + const Transitions &trans_model, const fst::SymbolTable *word_syms, std::string utt, double acoustic_scale, @@ -314,7 +314,7 @@ template bool DecodeUtteranceLatticeFaster( template bool DecodeUtteranceLatticeFaster( LatticeFasterDecoderTpl &decoder, DecodableInterface &decodable, - const TransitionModel &trans_model, + const Transitions &trans_model, const fst::SymbolTable *word_syms, std::string utt, double acoustic_scale, @@ -331,7 +331,7 @@ template bool DecodeUtteranceLatticeFaster( bool DecodeUtteranceLatticeSimple( LatticeSimpleDecoder &decoder, // not const but is really an input. DecodableInterface &decodable, // not const but is really an input. - const TransitionModel &trans_model, + const Transitions &trans_model, const fst::SymbolTable *word_syms, std::string utt, double acoustic_scale, diff --git a/src/decoder/decoder-wrappers.h b/src/decoder/decoder-wrappers.h index 17592d0282b..3e440cea1e5 100644 --- a/src/decoder/decoder-wrappers.h +++ b/src/decoder/decoder-wrappers.h @@ -103,7 +103,7 @@ template bool DecodeUtteranceLatticeFaster( LatticeFasterDecoderTpl &decoder, // not const but is really an input. DecodableInterface &decodable, // not const but is really an input. - const TransitionModel &trans_model, + const Transitions &trans_model, const fst::SymbolTable *word_syms, std::string utt, double acoustic_scale, @@ -129,7 +129,7 @@ class DecodeUtteranceLatticeFasterClass { DecodeUtteranceLatticeFasterClass( LatticeFasterDecoder *decoder, DecodableInterface *decodable, - const TransitionModel &trans_model, + const Transitions &trans_model, const fst::SymbolTable *word_syms, const std::string &utt, BaseFloat acoustic_scale, @@ -150,7 +150,7 @@ class DecodeUtteranceLatticeFasterClass { // The following variables correspond to inputs: LatticeFasterDecoder *decoder_; DecodableInterface *decodable_; - const TransitionModel *trans_model_; + const Transitions *trans_model_; const fst::SymbolTable *word_syms_; std::string utt_; BaseFloat acoustic_scale_; @@ -183,7 +183,7 @@ class DecodeUtteranceLatticeFasterClass { bool DecodeUtteranceLatticeSimple( LatticeSimpleDecoder &decoder, // not const but is really an input. DecodableInterface &decodable, // not const but is really an input. 
- const TransitionModel &trans_model, + const Transitions &trans_model, const fst::SymbolTable *word_syms, std::string utt, double acoustic_scale, diff --git a/src/decoder/training-graph-compiler.cc b/src/decoder/training-graph-compiler.cc index 191d02f1720..a59e83dee43 100644 --- a/src/decoder/training-graph-compiler.cc +++ b/src/decoder/training-graph-compiler.cc @@ -23,7 +23,7 @@ namespace kaldi { -TrainingGraphCompiler::TrainingGraphCompiler(const TransitionModel &trans_model, +TrainingGraphCompiler::TrainingGraphCompiler(const Transitions &trans_model, const ContextDependency &ctx_dep, // Does not maintain reference to this. fst::VectorFst *lex_fst, const std::vector &disambig_syms, @@ -98,15 +98,17 @@ bool TrainingGraphCompiler::CompileGraph(const fst::VectorFst &word KALDI_ASSERT(ctx2word_fst.Start() != kNoStateId); HTransducerConfig h_cfg; - h_cfg.transition_scale = opts_.transition_scale; std::vector disambig_syms_h; // disambiguation symbols on - // input side of H. - VectorFst *H = GetHTransducer(inv_cfst.IlabelInfo(), - ctx_dep_, - trans_model_, - h_cfg, - &disambig_syms_h); + // input side of H. + + std::unique_ptr> H = GetHTransducer(inv_cfst.IlabelInfo(), + ctx_dep_, + trans_model_, + h_cfg, + &disambig_syms_h); + + RemoveWeights(H.get()); VectorFst &trans2word_fst = *out_fst; // transition-id to word. TableCompose(*H, ctx2word_fst, &trans2word_fst); @@ -129,15 +131,15 @@ bool TrainingGraphCompiler::CompileGraph(const fst::VectorFst &word MinimizeEncoded(&trans2word_fst); std::vector disambig; - bool check_no_self_loops = true; + bool currently_self_loop_free = true, + use_weights = false; + AddSelfLoops(trans_model_, disambig, - opts_.self_loop_scale, - opts_.reorder, - check_no_self_loops, + currently_self_loop_free, + use_weights, &trans2word_fst); - delete H; return true; } @@ -195,14 +197,13 @@ bool TrainingGraphCompiler::CompileGraphs( } HTransducerConfig h_cfg; - h_cfg.transition_scale = opts_.transition_scale; std::vector disambig_syms_h; - VectorFst *H = GetHTransducer(inv_cfst.IlabelInfo(), - ctx_dep_, - trans_model_, - h_cfg, - &disambig_syms_h); + std::unique_ptr> H = GetHTransducer(inv_cfst.IlabelInfo(), + ctx_dep_, + trans_model_, + h_cfg, + &disambig_syms_h); for (size_t i = 0; i < out_fsts->size(); i++) { VectorFst &ctx2word_fst = *((*out_fsts)[i]); @@ -216,25 +217,21 @@ bool TrainingGraphCompiler::CompileGraphs( if (opts_.rm_eps) RemoveEpsLocal(&trans2word_fst); } - - // Encoded minimization. 
MinimizeEncoded(&trans2word_fst); std::vector disambig; - bool check_no_self_loops = true; + bool currently_self_loop_free = true, + use_weights = true; AddSelfLoops(trans_model_, disambig, - opts_.self_loop_scale, - opts_.reorder, - check_no_self_loops, + currently_self_loop_free, + use_weights, &trans2word_fst); KALDI_ASSERT(trans2word_fst.Start() != kNoStateId); *((*out_fsts)[i]) = trans2word_fst; } - - delete H; return true; } diff --git a/src/decoder/training-graph-compiler.h b/src/decoder/training-graph-compiler.h index ee56c6dfb3d..989accb2a05 100644 --- a/src/decoder/training-graph-compiler.h +++ b/src/decoder/training-graph-compiler.h @@ -21,7 +21,7 @@ #define KALDI_DECODER_TRAINING_GRAPH_COMPILER_H_ #include "base/kaldi-common.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fst/fstlib.h" #include "fstext/fstext-lib.h" #include "tree/context-dep.h" @@ -31,34 +31,20 @@ namespace kaldi { struct TrainingGraphCompilerOptions { - BaseFloat transition_scale; - BaseFloat self_loop_scale; bool rm_eps; - bool reorder; // (Dan-style graphs) - explicit TrainingGraphCompilerOptions(BaseFloat transition_scale = 1.0, - BaseFloat self_loop_scale = 1.0, - bool b = true) : - transition_scale(transition_scale), - self_loop_scale(self_loop_scale), - rm_eps(false), - reorder(b) { } + explicit TrainingGraphCompilerOptions(): rm_eps(false) { } void Register(OptionsItf *opts) { - opts->Register("transition-scale", &transition_scale, "Scale of transition " - "probabilities (excluding self-loops)"); - opts->Register("self-loop-scale", &self_loop_scale, "Scale of self-loop vs. " - "non-self-loop probability mass "); - opts->Register("reorder", &reorder, "Reorder transition ids for greater decoding efficiency."); - opts->Register("rm-eps", &rm_eps, "Remove [most] epsilons before minimization (only applicable " - "if disambig symbols present)"); + opts->Register("rm-eps", &rm_eps, "Remove [most] epsilons before minimization (only " + "matters if disambig symbols present)"); } }; class TrainingGraphCompiler { public: - TrainingGraphCompiler(const TransitionModel &trans_model, // Maintains reference to this object. + TrainingGraphCompiler(const Transitions &trans_model, // Maintains reference to this object. const ContextDependency &ctx_dep, // And this. fst::VectorFst *lex_fst, // Takes ownership of this object. // It should not contain disambiguation symbols or subsequential symbol, @@ -93,7 +79,7 @@ class TrainingGraphCompiler { ~TrainingGraphCompiler() { delete lex_fst_; } private: - const TransitionModel &trans_model_; + const Transitions &trans_model_; const ContextDependency &ctx_dep_; fst::VectorFst *lex_fst_; // lexicon FST (an input; we take // ownership as we need to modify it). diff --git a/src/doc/online_decoding.dox b/src/doc/online_decoding.dox index 9bcc2575be1..dc04d9bef4e 100644 --- a/src/doc/online_decoding.dox +++ b/src/doc/online_decoding.dox @@ -444,25 +444,22 @@ The program to run the TCP sever is online2-tcp-nnet3-decode-faster located in t ~/src/online2bin folder. The usage is as follows: \verbatim -online2-tcp-nnet3-decode-faster +online2-tcp-nnet3-decode-faster \endverbatim For example: \verbatim -online2-tcp-nnet3-decode-faster model/final.mdl graph/HCLG.fst graph/words.txt +online2-tcp-nnet3-decode-faster model/final.mdl graph/HCLG.fst graph/words.txt 5050 \endverbatim The word symbol table is mandatory (unlike other nnet3 online decoding programs) because the server outputs word strings. 
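In the training-graph-compiler hunks above, GetHTransducer now returns a std::unique_ptr, which is why the trailing "delete H;" statements disappear from both CompileGraph and CompileGraphs. A generic sketch of that ownership change (the Make* functions below are toys, not Kaldi code):

    // Sketch: returning unique_ptr instead of a raw owning pointer removes the
    // need for a manual delete on every exit path.
    #include <iostream>
    #include <memory>
    #include <vector>

    struct Fst { std::vector<int> arcs; };

    Fst *MakeFstOld() { return new Fst{{1, 2, 3}}; }     // caller must delete

    std::unique_ptr<Fst> MakeFstNew() {                  // ownership lives in the type
      return std::make_unique<Fst>(Fst{{1, 2, 3}});
    }

    int main() {
      {
        Fst *h = MakeFstOld();
        std::cout << "old: " << h->arcs.size() << " arcs\n";
        delete h;                      // easy to forget, e.g. on an early return
      }
      {
        std::unique_ptr<Fst> h = MakeFstNew();
        std::cout << "new: " << h->arcs.size() << " arcs\n";
      }                                // freed automatically here
      return 0;
    }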
Endpointing is mandatory to make the operation of the program reasonable. Other, non-standard options include: - - port-num - the port the server listens on (by default 5050) - samp-freq - sampling frequency of audio (usually 8000 for telephony and 16000 for other uses) - chunk-length - length of signal being processed by decoder at each step - output-period - how often we check for changes in the decoding (ie. output refresh rate, default 1s) - num-threads-startup - number of threads used when initializing iVector extractor - - read-timeout - it the program doesn't receive data during this timeout, the server terminates the connection. - Use -1 to disable this feature. The TCP protocol simply takes RAW signal on input (16-bit signed integer encoding at chosen sampling frequency) and outputs simple text using the following @@ -482,25 +479,9 @@ command should look like this: \verbatim online2-tcp-nnet3-decode-faster --samp-freq=8000 --frames-per-chunk=20 --extra-left-context-initial=0 --frame-subsampling-factor=3 --config=model/conf/online.conf --min-active=200 --max-active=7000 - --beam=15.0 --lattice-beam=6.0 --acoustic-scale=1.0 --port-num=5050 model/final.mdl graph/HCLG.fst graph/words.txt + --beam=15.0 --lattice-beam=6.0 --acoustic-scale=1.0 model/final.mdl graph/HCLG.fst graph/words.txt 5050 \endverbatim -Note in order to make the communication as simple as possible, the server has to accept -any data on input and cannot figure out when the stream is over. It will therefore not -be able to terminate the connection and it is the client's resposibility to disconnect -when it is ready to do so. As a fallback for certain situations, the read-timeout option -was added, which will automatically disconnect if a chosen amount of seconds has passed. -Keep in mind, that this is not an ideal solution and it's a better idea to design your -client to properly disconnect the connection when neccessary. - -For testing purposes, we will use the netcat program. We will also use sox to reeoncode the -files properly from any source. Netcat has an issue that, similarly to what was stated above -about the server, it cannot always interpret the data and usually it won't automatically -disconnect the TCP connection. To get around this, we will use the '-N' switch, which kills -the connection once streaming of the file is complete, but this can have a small sideffect of -not reading the whole output from the Kaldi server if the discconect comes too fast. Just -keep this in mind if you intend to implement any of these programs into a production environment. 
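As described above, the server reads raw 16-bit signed PCM at the configured sampling frequency and writes plain text back, and the port is now a positional argument rather than an option. A minimal POSIX client sketch of that protocol, assuming a server on 127.0.0.1:5050 and a placeholder file utt.raw; how the server reacts to a half-closed connection is not specified here, so treat the shutdown step as an assumption:

    // Sketch: stream raw 16-bit PCM to the TCP server, print the text replies.
    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <unistd.h>

    #include <cstdio>

    int main() {
      int sock = socket(AF_INET, SOCK_STREAM, 0);
      sockaddr_in addr{};
      addr.sin_family = AF_INET;
      addr.sin_port = htons(5050);                      // assumed server port
      inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);  // assumed server address
      if (sock < 0 || connect(sock, reinterpret_cast<sockaddr *>(&addr), sizeof(addr)) != 0) {
        perror("connect");
        return 1;
      }

      // Raw audio: 16-bit signed samples at the --samp-freq the server expects.
      FILE *f = std::fopen("utt.raw", "rb");            // placeholder file name
      char buf[4096];
      size_t n;
      while (f && (n = std::fread(buf, 1, sizeof(buf), f)) > 0)
        send(sock, buf, n, 0);
      if (f) std::fclose(f);
      shutdown(sock, SHUT_WR);                          // signal end of audio (assumption)

      ssize_t r;                                        // server replies with text lines
      while ((r = recv(sock, buf, sizeof(buf), 0)) > 0)
        std::fwrite(buf, 1, static_cast<size_t>(r), stdout);
      close(sock);
      return 0;
    }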
- To send a WAV file into the server, it first needs to be decoded into raw audio, then it can be sent to the socket: \verbatim diff --git a/src/feat/Makefile b/src/feat/Makefile index dcd029f7f94..4396caaf409 100644 --- a/src/feat/Makefile +++ b/src/feat/Makefile @@ -4,19 +4,19 @@ all: include ../kaldi.mk -TESTFILES = feature-mfcc-test feature-plp-test feature-fbank-test \ +TESTFILES = feature-mfcc-test \ feature-functions-test pitch-functions-test feature-sdc-test \ resample-test online-feature-test signal-test wave-reader-test -OBJFILES = feature-functions.o feature-mfcc.o feature-plp.o feature-fbank.o \ - feature-spectrogram.o mel-computations.o wave-reader.o \ +OBJFILES = feature-functions.o feature-mfcc.o feature-fbank.o \ + mel-computations.o wave-reader.o \ pitch-functions.o resample.o online-feature.o signal.o \ feature-window.o LIBNAME = kaldi-feat ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/feat/feature-common-inl.h b/src/feat/feature-common-inl.h index 26127a4dc4d..10bfe5cdfd1 100644 --- a/src/feat/feature-common-inl.h +++ b/src/feat/feature-common-inl.h @@ -70,15 +70,12 @@ void OfflineFeatureTpl::Compute( } output->Resize(rows_out, cols_out); Vector window; // windowed waveform. - bool use_raw_log_energy = computer_.NeedRawLogEnergy(); for (int32 r = 0; r < rows_out; r++) { // r is frame index. - BaseFloat raw_log_energy = 0.0; ExtractWindow(0, wave, r, computer_.GetFrameOptions(), - feature_window_function_, &window, - (use_raw_log_energy ? &raw_log_energy : NULL)); + feature_window_function_, &window); SubVector output_row(*output, r); - computer_.Compute(raw_log_energy, vtln_warp, &window, &output_row); + computer_.Compute(vtln_warp, &window, &output_row); } } diff --git a/src/feat/feature-common.h b/src/feat/feature-common.h index 3c2fbd37381..04cdca6d8bf 100644 --- a/src/feat/feature-common.h +++ b/src/feat/feature-common.h @@ -115,8 +115,10 @@ class OfflineFeatureTpl { // Note: feature_window_function_ is the windowing function, which initialized // using the options class, that we cache at this level. 
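The Compute() hunk in feature-common-inl.h above drops the raw-log-energy plumbing: each frame is simply extracted with the cached window function and handed to the computer. A standalone sketch of that per-frame loop, using a Hamming window and a trivial log-energy "computer" as stand-ins (frame counting is simplified relative to Kaldi's frame-extraction options):

    // Sketch: framewise feature extraction with a window vector cached once.
    #include <cmath>
    #include <iostream>
    #include <vector>

    std::vector<float> MakeHammingWindow(int n) {
      const double kPi = 3.14159265358979323846;
      std::vector<float> w(n);
      for (int i = 0; i < n; ++i)
        w[i] = 0.54 - 0.46 * std::cos(2.0 * kPi * i / (n - 1));
      return w;
    }

    // Stand-in for the per-frame computer: here, just the frame's log-energy.
    float ComputeFrameFeature(const std::vector<float> &frame) {
      float e = 1e-10f;
      for (float s : frame) e += s * s;
      return std::log(e);
    }

    int main() {
      const int frame_len = 400, frame_shift = 160;   // 25 ms / 10 ms at 16 kHz
      std::vector<float> wave(16000, 0.01f);          // 1 s of dummy audio
      std::vector<float> window = MakeHammingWindow(frame_len);  // cached once

      int num_frames = 1 + (static_cast<int>(wave.size()) - frame_len) / frame_shift;
      for (int r = 0; r < num_frames; ++r) {
        std::vector<float> frame(frame_len);
        for (int i = 0; i < frame_len; ++i)           // extract frame and window it
          frame[i] = wave[r * frame_shift + i] * window[i];
        float feat = ComputeFrameFeature(frame);
        if (r < 3) std::cout << "frame " << r << ": " << feat << "\n";
      }
      return 0;
    }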
OfflineFeatureTpl(const Options &opts): - computer_(opts), - feature_window_function_(computer_.GetFrameOptions()) { } + computer_(opts) { + InitFeatureWindowFunction(computer_.GetFrameOptions(), + &feature_window_function_); + } // Internal (and back-compatibility) interface for computing features, which // requires that the user has already checked that the sampling frequency @@ -164,7 +166,7 @@ class OfflineFeatureTpl { OfflineFeatureTpl &operator =(const OfflineFeatureTpl &other); F computer_; - FeatureWindowFunction feature_window_function_; + Vector feature_window_function_; }; /// @} End of "addtogroup feat" diff --git a/src/feat/feature-fbank-test.cc b/src/feat/feature-fbank-test.cc index 47b7b1c4244..9298b47eba4 100644 --- a/src/feat/feature-fbank-test.cc +++ b/src/feat/feature-fbank-test.cc @@ -29,431 +29,6 @@ using namespace kaldi; -static void UnitTestReadWave() { - - std::cout << "=== UnitTestReadWave() ===\n"; - - Vector v, v2; - - std::cout << "<<<=== Reading waveform\n"; - - { - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - const Matrix data(wave.Data()); - KALDI_ASSERT(data.NumRows() == 1); - v.Resize(data.NumCols()); - v.CopyFromVec(data.Row(0)); - } - - std::cout << "<<<=== Reading Vector waveform, prepared by matlab\n"; - std::ifstream input( - "test_data/test_matlab.ascii" - ); - KALDI_ASSERT(input.good()); - v2.Read(input, false); - input.close(); - - std::cout << "<<<=== Comparing freshly read waveform to 'libsndfile' waveform\n"; - KALDI_ASSERT(v.Dim() == v2.Dim()); - for (int32 i = 0; i < v.Dim(); i++) { - KALDI_ASSERT(v(i) == v2(i)); - } - std::cout << "<<<=== Comparing done\n"; - - // std::cout << "== The Waveform Samples == \n"; - // std::cout << v; - - std::cout << "Test passed :)\n\n"; - -} - - - -/** - */ -static void UnitTestSimple() { - std::cout << "=== UnitTestSimple() ===\n"; - - Vector v(100000); - Matrix m; - - // init with noise - for (int32 i = 0; i < v.Dim(); i++) { - v(i) = (abs( i * 433024253 ) % 65535) - (65535 / 2); - } - - std::cout << "<<<=== Just make sure it runs... Nothing is compared\n"; - // the parametrization object - FbankOptions op; - // trying to have same opts as baseline. - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "rectangular"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.low_freq = 0.0; - op.htk_compat = true; - op.use_energy = true; - - Fbank fbank(op); - // use default parameters - - // compute fbanks. - fbank.Compute(v, 1.0, &m); - - // possibly dump - // std::cout << "== Output features == \n" << m; - std::cout << "Test passed :)\n\n"; -} - - -static void UnitTestHTKCompare1() { - std::cout << "=== UnitTestHTKCompare1() ===\n"; - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.fbank_htk.1", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use fbank with default configuration... 
- FbankOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.low_freq = 0.0; - op.htk_compat = true; - op.mel_opts.htk_mode = true; - op.use_energy = false; // C0 not energy. - - Fbank fbank(op); - - // calculate kaldi features - Matrix kaldi_features; - fbank.Compute(waveform, 1.0, &kaldi_features); - - - std::cout << "<<<=== Compare with HTK features...\n"; - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - /// THE FEATURES ARE ALMOST IDENTICAL WITH HTK!!! (SEE THE TOLERANCE!) - if ((std::abs(b - a)) > 0.001) { //<< TOLERANCE TO DIFFERENCES!!!!! - // print the non-matching data only once per-line - if (i_old != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - }}} - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float)*kaldi_features.NumCols()), - 000007 // FBANK - }; - { - std::ofstream os("tmp.test.wav.fbank_kaldi.1", - std::ios::out|std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fbank_kaldi.1"); -} - - -static void UnitTestHTKCompare2() { - std::cout << "=== UnitTestHTKCompare2() ===\n"; - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.fbank_htk.2", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use fbank with default configuration... - FbankOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.low_freq = 25.0; - op.htk_compat = true; - op.mel_opts.htk_mode = true; - op.use_energy = false; // C0 not energy. - - Fbank fbank(op); - - // calculate kaldi features - Matrix kaldi_features; - fbank.Compute(waveform, 1.0, &kaldi_features); - - - std::cout << "<<<=== Compare with HTK features...\n"; - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. 
- for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - /// THE FEATURES ARE ALMOST IDENTICAL WITH HTK!!! (SEE THE TOLERANCE!) - if ((std::abs(b - a)) > 0.001) { //<< TOLERANCE TO DIFFERENCES!!!!! - // print the non-matching data only once per-line - if (i_old != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - }}} - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float)*kaldi_features.NumCols()), - 000007 // FBANK - }; - { - std::ofstream os("tmp.test.wav.fbank_kaldi.1", - std::ios::out|std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fbank_kaldi.1"); -} - -static void UnitTestHTKCompare3() { - std::cout << "=== UnitTestHTKCompare3() ===\n"; - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.fbank_htk.3", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use fbank with default configuration... - FbankOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.low_freq = 25.0; - op.htk_compat = true; - op.mel_opts.htk_mode = true; - op.use_energy = false; // C0 not energy. - - op.mel_opts.vtln_low = 100.0; - op.mel_opts.vtln_high = 7500.0; - BaseFloat vtln_warp = 0.9; - - Fbank fbank(op); - - // calculate kaldi features - Matrix kaldi_features; - fbank.Compute(waveform, vtln_warp, &kaldi_features); - - - std::cout << "<<<=== Compare with HTK features...\n"; - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - /// THE FEATURES ARE ALMOST IDENTICAL WITH HTK!!! (SEE THE TOLERANCE!) - if ((std::abs(b - a)) > 0.001) { //<< TOLERANCE TO DIFFERENCES!!!!! - // print the non-matching data only once per-line - if (i_old != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - if (j < 20) passed = false; // We know the last couple of filterbanks differ. We let this slide. 
- else KALDI_WARN << "Ignoring difference in last fbanks, we know the algorithms differ."; - }}} - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float)*kaldi_features.NumCols()), - 000007 // FBANK - }; - { - std::ofstream os("tmp.test.wav.fbank_kaldi.1", - std::ios::out|std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fbank_kaldi.1"); -} - - -static void UnitTestHTKCompare4() { - std::cout << "=== UnitTestHTKCompare4() ===\n"; - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.fbank_htk.4", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use fbank with default configuration... - FbankOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.low_freq = 25.0; - op.htk_compat = true; - op.mel_opts.htk_mode = true; - op.use_energy = false; // C0 not energy. - - op.mel_opts.vtln_low = 100.0; - op.mel_opts.vtln_high = 7500.0; - BaseFloat vtln_warp = 1.1; - - Fbank fbank(op); - - // calculate kaldi features - Matrix kaldi_features; - fbank.Compute(waveform, vtln_warp, &kaldi_features); - - - std::cout << "<<<=== Compare with HTK features...\n"; - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - /// THE FEATURES ARE ALMOST IDENTICAL WITH HTK!!! (SEE THE TOLERANCE!) - if ((std::abs(b - a)) > 0.01) { //<< TOLERANCE TO DIFFERENCES!!!!! 
- // print the non-matching data only once per-line - if (i_old != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - }}} - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float)*kaldi_features.NumCols()), - 000007 // FBANK - }; - { - std::ofstream os("tmp.test.wav.fbank_kaldi.1", - std::ios::out|std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fbank_kaldi.1"); -} - - - - -static void UnitTestFeat() { - UnitTestReadWave(); - UnitTestSimple(); - UnitTestHTKCompare1(); - UnitTestHTKCompare2(); - UnitTestHTKCompare3(); - UnitTestHTKCompare4(); -} - - - int main() { try { @@ -466,5 +41,3 @@ int main() { return 1; } } - - diff --git a/src/feat/feature-fbank.cc b/src/feat/feature-fbank.cc index d9ac03e5920..df10712f956 100644 --- a/src/feat/feature-fbank.cc +++ b/src/feat/feature-fbank.cc @@ -24,28 +24,22 @@ namespace kaldi { FbankComputer::FbankComputer(const FbankOptions &opts): - opts_(opts), srfft_(NULL) { - if (opts.energy_floor > 0.0) - log_energy_floor_ = Log(opts.energy_floor); - - int32 padded_window_size = opts.frame_opts.PaddedWindowSize(); - if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two... - srfft_ = new SplitRadixRealFft(padded_window_size); - + opts_(opts), + srfft_(new SplitRadixRealFft(opts.frame_opts.PaddedWindowSize())) { + KALDI_ASSERT(opts.energy_floor > 0.0 && "Nonzero energy floor is required."); // We'll definitely need the filterbanks info for VTLN warping factor 1.0. // [note: this call caches it.] GetMelBanks(1.0); } FbankComputer::FbankComputer(const FbankComputer &other): - opts_(other.opts_), log_energy_floor_(other.log_energy_floor_), - mel_banks_(other.mel_banks_), srfft_(NULL) { + opts_(other.opts_), + mel_banks_(other.mel_banks_), + srfft_(new SplitRadixRealFft(*(other.srfft_))) { for (std::map::iterator iter = mel_banks_.begin(); iter != mel_banks_.end(); ++iter) iter->second = new MelBanks(*(iter->second)); - if (other.srfft_) - srfft_ = new SplitRadixRealFft(*(other.srfft_)); } FbankComputer::~FbankComputer() { @@ -69,8 +63,7 @@ const MelBanks* FbankComputer::GetMelBanks(BaseFloat vtln_warp) { return this_mel_banks; } -void FbankComputer::Compute(BaseFloat signal_raw_log_energy, - BaseFloat vtln_warp, +void FbankComputer::Compute(BaseFloat vtln_warp, VectorBase *signal_frame, VectorBase *feature) { @@ -80,45 +73,40 @@ void FbankComputer::Compute(BaseFloat signal_raw_log_energy, feature->Dim() == this->Dim()); - // Compute energy after window function (not the raw one). - if (opts_.use_energy && !opts_.raw_energy) - signal_raw_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), - std::numeric_limits::epsilon())); + BaseFloat signal_log_energy = 0.0; + if (opts_.use_energy) + signal_log_energy = Log(std::max( + VecVec(*signal_frame, *signal_frame), + opts_.energy_floor * opts_.frame_opts.WindowSize())); - if (srfft_ != NULL) // Compute FFT using split-radix algorithm. - srfft_->Compute(signal_frame->Data(), true); - else // An alternative algorithm that works for non-powers-of-two. - RealFft(signal_frame, true); + // Compute FFT using split-radix algorithm. 
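The rewritten fbank Compute() above floors the frame energy at energy_floor * window_size, and just below it floors each power-spectrum bin at energy_floor * sqrt(window_size); the new default energy_floor of 1.0e-09 is roughly (1.0/32768.0)^2, i.e. one 16-bit LSB once waveforms are scaled to +-1. A small numeric sketch of that arithmetic (values are illustrative):

    // Sketch of the energy-floor arithmetic in the rewritten fbank computation.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
      const double energy_floor = 1.0e-09;   // default; ~ (1.0/32768.0)^2
      const int window_size = 400;           // 25 ms at 16 kHz

      // A nearly silent frame under the +-1.0 waveform convention.
      std::vector<double> frame(window_size, 1.0 / 32768.0);

      double energy = 0.0;
      for (double s : frame) energy += s * s;

      // Frame log-energy, floored as in the Compute() change above.
      double log_energy = std::log(std::max(energy, energy_floor * window_size));

      // Per-FFT-bin floor applied to the power spectrum.
      double bin_floor = energy_floor * std::sqrt(static_cast<double>(window_size));

      std::printf("energy=%g floor=%g log-energy=%g bin-floor=%g\n",
                  energy, energy_floor * window_size, log_energy, bin_floor);
      return 0;
    }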
+ srfft_->Compute(signal_frame->Data(), true); // Convert the FFT into a power spectrum. ComputePowerSpectrum(signal_frame); SubVector power_spectrum(*signal_frame, 0, signal_frame->Dim() / 2 + 1); - // Use magnitude instead of power if requested. - if (!opts_.use_power) - power_spectrum.ApplyPow(0.5); + // The energy_floor has the scale for the energy of a single sample, and the + // FFT has a higher dynamic range (it's not the orthogonal FFT)... the sqrt + // expression is to correct for that. + BaseFloat floor = opts_.energy_floor * + std::sqrt(BaseFloat(opts_.frame_opts.WindowSize())); + power_spectrum.ApplyFloor(floor); - int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0); + int32 mel_offset = (opts_.use_energy ? 1 : 0); SubVector mel_energies(*feature, mel_offset, opts_.mel_opts.num_bins); // Sum with mel fiterbanks over the power spectrum mel_banks.Compute(power_spectrum, &mel_energies); - if (opts_.use_log_fbank) { - // Avoid log of zero (which should be prevented anyway by dithering). - mel_energies.ApplyFloor(std::numeric_limits::epsilon()); - mel_energies.ApplyLog(); // take the log. - } - // Copy energy as first value (or the last, if htk_compat == true). + mel_energies.ApplyLog(); // take the log. + + // Copy energy as first value if (opts_.use_energy) { - if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) { - signal_raw_log_energy = log_energy_floor_; - } - int32 energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0; - (*feature)(energy_index) = signal_raw_log_energy; + (*feature)(0) = signal_log_energy; } } diff --git a/src/feat/feature-fbank.h b/src/feat/feature-fbank.h index f57d185a41c..665a087fcaa 100644 --- a/src/feat/feature-fbank.h +++ b/src/feat/feature-fbank.h @@ -42,41 +42,26 @@ struct FbankOptions { FrameExtractionOptions frame_opts; MelBanksOptions mel_opts; bool use_energy; // append an extra dimension with energy to the filter banks - BaseFloat energy_floor; - bool raw_energy; // If true, compute energy before preemphasis and windowing - bool htk_compat; // If true, put energy last (if using energy) - bool use_log_fbank; // if true (default), produce log-filterbank, else linear - bool use_power; // if true (default), use power in filterbank analysis, else magnitude. + BaseFloat energy_floor; // Floor on energy, to avoid log(0.0), which will be + // multiplied by sqrt(window-length-in-frames) and + // applied per FFT bin. The value of 1.0e-09 is + // approximately (1.0/32768.0)^2, like a signal value + // of +- 1 in a 16-bit recording. FbankOptions(): mel_opts(23), - // defaults the #mel-banks to 23 for the FBANK computations. - // this seems to be common for 16khz-sampled data, - // but for 8khz-sampled data, 15 may be better. - use_energy(false), - energy_floor(0.0), - raw_energy(true), - htk_compat(false), - use_log_fbank(true), - use_power(true) {} + use_energy(false), + energy_floor(1.0e-09) { } void Register(OptionsItf *opts) { frame_opts.Register(opts); mel_opts.Register(opts); opts->Register("use-energy", &use_energy, - "Add an extra dimension with energy to the FBANK output."); + "Add an extra dimension with energy to the filterbank " + "output."); opts->Register("energy-floor", &energy_floor, - "Floor on energy (absolute, not relative) in FBANK computation. " - "Only makes a difference if --use-energy=true; only necessary if " - "--dither=0.0. 
Suggested values: 0.1 or 1.0"); - opts->Register("raw-energy", &raw_energy, - "If true, compute energy before preemphasis and windowing"); - opts->Register("htk-compat", &htk_compat, "If true, put energy last. " - "Warning: not sufficient to get HTK compatible features (need " - "to change other parameters)."); - opts->Register("use-log-fbank", &use_log_fbank, - "If true, produce log-filterbank, else produce linear."); - opts->Register("use-power", &use_power, - "If true, use power, else use magnitude."); + "Floor on energy expressed as a squared-signal value per " + "frame. The default value represents about +-1 in int16 " + "representation."); } }; @@ -94,8 +79,6 @@ class FbankComputer { return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0); } - bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; } - const FrameExtractionOptions &GetFrameOptions() const { return opts_.frame_opts; } @@ -104,11 +87,6 @@ class FbankComputer { Function that computes one frame of features from one frame of signal. - @param [in] signal_raw_log_energy The log-energy of the frame of the signal - prior to windowing and pre-emphasis, or - log(numeric_limits::min()), whichever is greater. Must be - ignored by this function if this class returns false from - this->NeedsRawLogEnergy(). @param [in] vtln_warp The VTLN warping factor that the user wants to be applied when computing features for this utterance. Will normally be 1.0, meaning no warping is to be done. The value will @@ -121,8 +99,7 @@ class FbankComputer { @param [out] feature Pointer to a vector of size this->Dim(), to which the computed feature will be written. */ - void Compute(BaseFloat signal_raw_log_energy, - BaseFloat vtln_warp, + void Compute(BaseFloat vtln_warp, VectorBase *signal_frame, VectorBase *feature); @@ -133,7 +110,6 @@ class FbankComputer { FbankOptions opts_; - BaseFloat log_energy_floor_; std::map mel_banks_; // BaseFloat is VTLN coefficient. SplitRadixRealFft *srfft_; // Disallow assignment. diff --git a/src/feat/feature-functions.cc b/src/feat/feature-functions.cc index 76500ccf87a..36c20df6f84 100644 --- a/src/feat/feature-functions.cc +++ b/src/feat/feature-functions.cc @@ -29,13 +29,8 @@ namespace kaldi { void ComputePowerSpectrum(VectorBase *waveform) { int32 dim = waveform->Dim(); - // no, letting it be non-power-of-two for now. - // KALDI_ASSERT(dim > 0 && (dim & (dim-1) == 0)); // make sure a power of two.. actually my FFT code - // does not require this (dan) but this is better in case we use different code [dan]. - - // RealFft(waveform, true); // true == forward (not inverse) FFT; makes no difference here, - // as we just want power spectrum. - + // make sure a power of two. + KALDI_ASSERT(dim > 0 && ((dim & (dim-1)) == 0)); // now we have in waveform, first half of complex spectrum // it's stored as [real0, realN/2, real1, im1, real2, im2, ...] int32 half_dim = dim/2; @@ -46,8 +41,9 @@ void ComputePowerSpectrum(VectorBase *waveform) { (*waveform)(i) = real*real + im*im; } (*waveform)(0) = first_energy; - (*waveform)(half_dim) = last_energy; // Will actually never be used, and anyway - // if the signal has been bandlimited sensibly this should be zero. + (*waveform)(half_dim) = last_energy; + // Will actually never be used, and anyway if the signal has been bandlimited + // sensibly this should be zero. 
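ComputePowerSpectrum above now asserts a power-of-two dimension and works on the packed real-FFT layout [real0, realN/2, real1, im1, real2, im2, ...], producing N/2+1 power values. A standalone sketch of that unpacking (toy input, not Kaldi code):

    // Sketch: power spectrum from the packed real-FFT layout described above.
    #include <cassert>
    #include <cstdio>
    #include <vector>

    std::vector<float> PowerSpectrumFromPackedFft(const std::vector<float> &fft) {
      int dim = static_cast<int>(fft.size());
      assert(dim > 0 && (dim & (dim - 1)) == 0);   // must be a power of two
      int half_dim = dim / 2;
      std::vector<float> power(half_dim + 1);
      power[0] = fft[0] * fft[0];                  // DC bin (purely real)
      power[half_dim] = fft[1] * fft[1];           // Nyquist bin (purely real)
      for (int i = 1; i < half_dim; ++i) {
        float re = fft[i * 2], im = fft[i * 2 + 1];
        power[i] = re * re + im * im;
      }
      return power;
    }

    int main() {
      std::vector<float> fft = {4.0f, 0.0f, 1.0f, -1.0f, 0.5f, 0.5f, 2.0f, 0.0f};
      std::vector<float> p = PowerSpectrumFromPackedFft(fft);
      for (std::size_t i = 0; i < p.size(); ++i)
        std::printf("bin %zu: %g\n", i, p[i]);
      return 0;
    }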
} diff --git a/src/feat/feature-mfcc-test.cc b/src/feat/feature-mfcc-test.cc index c4367139707..280e2155c86 100644 --- a/src/feat/feature-mfcc-test.cc +++ b/src/feat/feature-mfcc-test.cc @@ -53,6 +53,7 @@ static void UnitTestReadWave() { ); KALDI_ASSERT(input.good()); v2.Read(input, false); + v2.Scale(BaseFloat(1.0 / 32768.0)); input.close(); std::cout << "<<<=== Comparing freshly read waveform to 'libsndfile' waveform\n"; @@ -71,551 +72,7 @@ static void UnitTestReadWave() { -/** - */ -static void UnitTestSimple() { - std::cout << "=== UnitTestSimple() ===\n"; - Vector v(100000); - Matrix m; - - // init with noise - for (int32 i = 0; i < v.Dim(); i++) { - v(i) = (abs( i * 433024253 ) % 65535) - (65535 / 2); - } - - std::cout << "<<<=== Just make sure it runs... Nothing is compared\n"; - // the parametrization object - MfccOptions op; - // trying to have same opts as baseline. - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "rectangular"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.low_freq = 0.0; - op.mel_opts.htk_mode = true; - op.htk_compat = true; - - Mfcc mfcc(op); - // use default parameters - - // compute mfccs. - mfcc.Compute(v, 1.0, &m); - - // possibly dump - // std::cout << "== Output features == \n" << m; - std::cout << "Test passed :)\n\n"; -} - - -static void UnitTestHTKCompare1() { - std::cout << "=== UnitTestHTKCompare1() ===\n"; - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.fea_htk.1", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use mfcc with default configuration... - MfccOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.low_freq = 0.0; - op.mel_opts.htk_mode = true; - op.htk_compat = true; - op.use_energy = false; // C0 not energy. - - Mfcc mfcc(op); - - // calculate kaldi features - Matrix kaldi_raw_features; - mfcc.Compute(waveform, 1.0, &kaldi_raw_features); - - DeltaFeaturesOptions delta_opts; - Matrix kaldi_features; - ComputeDeltas(delta_opts, - kaldi_raw_features, - &kaldi_features); - - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! 
- // print the non-matching data only once per-line - if (i_old != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - }}} - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float)*kaldi_features.NumCols()), - 021406 // MFCC_D_A_0 - }; - { - std::ofstream os("tmp.test.wav.fea_kaldi.1", - std::ios::out|std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fea_kaldi.1"); -} - - -static void UnitTestHTKCompare2() { - std::cout << "=== UnitTestHTKCompare2() ===\n"; - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.fea_htk.2", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use mfcc with default configuration... - MfccOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.low_freq = 0.0; - op.mel_opts.htk_mode = true; - op.htk_compat = true; - op.use_energy = true; // Use energy. - - Mfcc mfcc(op); - - // calculate kaldi features - Matrix kaldi_raw_features; - mfcc.Compute(waveform, 1.0, &kaldi_raw_features); - - DeltaFeaturesOptions delta_opts; - Matrix kaldi_features; - ComputeDeltas(delta_opts, - kaldi_raw_features, - &kaldi_features); - - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! 
- // print the non-matching data only once per-line - if (i_old != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - }}} - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float)*kaldi_features.NumCols()), - 021406 // MFCC_D_A_0 - }; - { - std::ofstream os("tmp.test.wav.fea_kaldi.2", - std::ios::out|std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fea_kaldi.2"); -} - - -static void UnitTestHTKCompare3() { - std::cout << "=== UnitTestHTKCompare3() ===\n"; - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.fea_htk.3", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use mfcc with default configuration... - MfccOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.htk_compat = true; - op.use_energy = true; // Use energy. - op.mel_opts.low_freq = 20.0; - //op.mel_opts.debug_mel = true; - op.mel_opts.htk_mode = true; - - Mfcc mfcc(op); - - // calculate kaldi features - Matrix kaldi_raw_features; - mfcc.Compute(waveform, 1.0, &kaldi_raw_features); - - DeltaFeaturesOptions delta_opts; - Matrix kaldi_features; - ComputeDeltas(delta_opts, - kaldi_raw_features, - &kaldi_features); - - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! 
- // print the non-matching data only once per-line - if (static_cast(i_old) != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - }}} - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float)*kaldi_features.NumCols()), - 021406 // MFCC_D_A_0 - }; - { - std::ofstream os("tmp.test.wav.fea_kaldi.3", - std::ios::out|std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fea_kaldi.3"); -} - - -static void UnitTestHTKCompare4() { - std::cout << "=== UnitTestHTKCompare4() ===\n"; - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.fea_htk.4", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use mfcc with default configuration... - MfccOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.low_freq = 0.0; - op.htk_compat = true; - op.use_energy = true; // Use energy. - op.mel_opts.htk_mode = true; - - Mfcc mfcc(op); - - // calculate kaldi features - Matrix kaldi_raw_features; - mfcc.Compute(waveform, 1.0, &kaldi_raw_features); - - DeltaFeaturesOptions delta_opts; - Matrix kaldi_features; - ComputeDeltas(delta_opts, - kaldi_raw_features, - &kaldi_features); - - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! 
- // print the non-matching data only once per-line - if (static_cast(i_old) != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - }}} - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float)*kaldi_features.NumCols()), - 021406 // MFCC_D_A_0 - }; - { - std::ofstream os("tmp.test.wav.fea_kaldi.4", - std::ios::out|std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fea_kaldi.4"); -} - - -static void UnitTestHTKCompare5() { - std::cout << "=== UnitTestHTKCompare5() ===\n"; - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.fea_htk.5", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use mfcc with default configuration... - MfccOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.htk_compat = true; - op.use_energy = true; // Use energy. - op.mel_opts.low_freq = 0.0; - op.mel_opts.vtln_low = 100.0; - op.mel_opts.vtln_high = 7500.0; - op.mel_opts.htk_mode = true; - - BaseFloat vtln_warp = 1.1; // our approach identical to htk for warp factor >1, - // differs slightly for higher mel bins if warp_factor <0.9 - - Mfcc mfcc(op); - - // calculate kaldi features - Matrix kaldi_raw_features; - mfcc.Compute(waveform, vtln_warp, &kaldi_raw_features); - - DeltaFeaturesOptions delta_opts; - Matrix kaldi_features; - ComputeDeltas(delta_opts, - kaldi_raw_features, - &kaldi_features); - - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! 
- // print the non-matching data only once per-line - if (static_cast(i_old) != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - }}} - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float)*kaldi_features.NumCols()), - 021406 // MFCC_D_A_0 - }; - { - std::ofstream os("tmp.test.wav.fea_kaldi.5", - std::ios::out|std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fea_kaldi.5"); -} - -static void UnitTestHTKCompare6() { - std::cout << "=== UnitTestHTKCompare6() ===\n"; - - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.fea_htk.6", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use mfcc with default configuration... - MfccOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.97; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.num_bins = 24; - op.mel_opts.low_freq = 125.0; - op.mel_opts.high_freq = 7800.0; - op.htk_compat = true; - op.use_energy = false; // C0 not energy. - - Mfcc mfcc(op); - - // calculate kaldi features - Matrix kaldi_raw_features; - mfcc.Compute(waveform, 1.0, &kaldi_raw_features); - - DeltaFeaturesOptions delta_opts; - Matrix kaldi_features; - ComputeDeltas(delta_opts, - kaldi_raw_features, - &kaldi_features); - - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! - // print the non-matching data only once per-line - if (static_cast(i_old) != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - }}} - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float)*kaldi_features.NumCols()), - 021406 // MFCC_D_A_0 - }; - { - std::ofstream os("tmp.test.wav.fea_kaldi.6", - std::ios::out|std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.fea_kaldi.6"); -} void UnitTestVtln() { // Test the function VtlnWarpFreq. 
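With the HTK-comparison tests deleted (they depended on the removed htk_compat option), the remaining tests only exercise the simplified front end. A minimal sketch of that flow, assuming the offline wrapper keeps the Compute(waveform, vtln_warp, &output) signature used by the surviving tests; the file path and option values are illustrative:

#include <fstream>

#include "feat/feature-mfcc.h"
#include "feat/wave-reader.h"

using namespace kaldi;

int main() {
  std::ifstream is("test_data/test.wav", std::ios_base::binary);
  WaveData wave;
  wave.Read(is);                                  // samples on the new roughly +-1 scale
                                                  // (cf. the 1/32768 scaling added in UnitTestReadWave)
  SubVector<BaseFloat> waveform(wave.Data(), 0);  // first (only) channel

  MfccOptions op;          // defaults: 23 mel bins, 13 cepstra, energy_floor = 1e-09
  op.use_energy = false;   // keep C0 instead of the log-energy

  Mfcc mfcc(op);
  Matrix<BaseFloat> feats;
  mfcc.Compute(waveform, 1.0 /* vtln_warp */, &feats);  // num_frames x 13
  return 0;
}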
@@ -656,16 +113,6 @@ void UnitTestVtln() { static void UnitTestFeat() { UnitTestVtln(); UnitTestReadWave(); - UnitTestSimple(); - UnitTestHTKCompare1(); - UnitTestHTKCompare2(); - // commenting out this one as it doesn't compare right now I normalized - // the way the FFT bins are treated (removed offset of 0.5)... this seems - // to relate to the way frequency zero behaves. - UnitTestHTKCompare3(); - UnitTestHTKCompare4(); - UnitTestHTKCompare5(); - UnitTestHTKCompare6(); std::cout << "Tests succeeded.\n"; } @@ -682,5 +129,3 @@ int main() { return 1; } } - - diff --git a/src/feat/feature-mfcc.cc b/src/feat/feature-mfcc.cc index 73ab4b312c4..79e02ca5db2 100644 --- a/src/feat/feature-mfcc.cc +++ b/src/feat/feature-mfcc.cc @@ -25,62 +25,57 @@ namespace kaldi { -void MfccComputer::Compute(BaseFloat signal_raw_log_energy, - BaseFloat vtln_warp, +// Compute liftering coefficients (scaling on cepstral coeffs) +// coeffs are numbered slightly differently from HTK: the zeroth +// index is C0, which is not affected. +static void ComputeLifterCoeffs(BaseFloat Q, VectorBase *coeffs) { + for (int32 i = 0; i < coeffs->Dim(); i++) + (*coeffs)(i) = 1.0 + 0.5 * Q * sin (M_PI * i / Q); +} + + +void MfccComputer::Compute(BaseFloat vtln_warp, VectorBase *signal_frame, VectorBase *feature) { KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() && feature->Dim() == this->Dim()); + BaseFloat signal_log_energy; + if (opts_.use_energy) + signal_log_energy = Log(std::max( + VecVec(*signal_frame, *signal_frame), + opts_.energy_floor * opts_.frame_opts.WindowSize())); const MelBanks &mel_banks = *(GetMelBanks(vtln_warp)); - if (opts_.use_energy && !opts_.raw_energy) - signal_raw_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), - std::numeric_limits::epsilon())); - - if (srfft_ != NULL) // Compute FFT using the split-radix algorithm. - srfft_->Compute(signal_frame->Data(), true); - else // An alternative algorithm that works for non-powers-of-two. - RealFft(signal_frame, true); + srfft_->Compute(signal_frame->Data(), true); // Convert the FFT into a power spectrum. ComputePowerSpectrum(signal_frame); SubVector power_spectrum(*signal_frame, 0, signal_frame->Dim() / 2 + 1); - mel_banks.Compute(power_spectrum, &mel_energies_); + // The energy_floor has the scale for the energy of a single sample, and the + // FFT has a higher dynamic range (it's not the orthogonal FFT)... the sqrt + // expression is to correct for that. + BaseFloat floor = opts_.energy_floor * + std::sqrt(BaseFloat(opts_.frame_opts.WindowSize())); + power_spectrum.ApplyFloor(floor); - // avoid log of zero (which should be prevented anyway by dithering). - mel_energies_.ApplyFloor(std::numeric_limits::epsilon()); - mel_energies_.ApplyLog(); // take the log. + mel_banks.Compute(power_spectrum, &mel_energies_); + mel_energies_.ApplyLog(); feature->SetZero(); // in case there were NaNs. 
// feature = dct_matrix_ * mel_energies [which now have log] feature->AddMatVec(1.0, dct_matrix_, kNoTrans, mel_energies_, 0.0); + feature->MulElements(lifter_coeffs_); - if (opts_.cepstral_lifter != 0.0) - feature->MulElements(lifter_coeffs_); - - if (opts_.use_energy) { - if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) - signal_raw_log_energy = log_energy_floor_; - (*feature)(0) = signal_raw_log_energy; - } - - if (opts_.htk_compat) { - BaseFloat energy = (*feature)(0); - for (int32 i = 0; i < opts_.num_ceps - 1; i++) - (*feature)(i) = (*feature)(i+1); - if (!opts_.use_energy) - energy *= M_SQRT2; // scale on C0 (actually removing a scale - // we previously added that's part of one common definition of - // the cosine transform.) - (*feature)(opts_.num_ceps - 1) = energy; - } + if (opts_.use_energy) + (*feature)(0) = signal_log_energy; } MfccComputer::MfccComputer(const MfccOptions &opts): - opts_(opts), srfft_(NULL), + opts_(opts), + srfft_(new SplitRadixRealFft(opts.frame_opts.PaddedWindowSize())), mel_energies_(opts.mel_opts.num_bins) { int32 num_bins = opts.mel_opts.num_bins; @@ -92,22 +87,16 @@ MfccComputer::MfccComputer(const MfccOptions &opts): Matrix dct_matrix(num_bins, num_bins); ComputeDctMatrix(&dct_matrix); + lifter_coeffs_.Resize(opts.num_ceps); + ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_); + + // Note that we include zeroth dct in either case. If using the // energy we replace this with the energy. This means a different // ordering of features than HTK. SubMatrix dct_rows(dct_matrix, 0, opts.num_ceps, 0, num_bins); dct_matrix_.Resize(opts.num_ceps, num_bins); dct_matrix_.CopyFromMat(dct_rows); // subset of rows. - if (opts.cepstral_lifter != 0.0) { - lifter_coeffs_.Resize(opts.num_ceps); - ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_); - } - if (opts.energy_floor > 0.0) - log_energy_floor_ = Log(opts.energy_floor); - - int32 padded_window_size = opts.frame_opts.PaddedWindowSize(); - if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two... - srfft_ = new SplitRadixRealFft(padded_window_size); // We'll definitely need the filterbanks info for VTLN warping factor 1.0. // [note: this call caches it.] @@ -117,15 +106,12 @@ MfccComputer::MfccComputer(const MfccOptions &opts): MfccComputer::MfccComputer(const MfccComputer &other): opts_(other.opts_), lifter_coeffs_(other.lifter_coeffs_), dct_matrix_(other.dct_matrix_), - log_energy_floor_(other.log_energy_floor_), mel_banks_(other.mel_banks_), - srfft_(NULL), + srfft_(new SplitRadixRealFft(*(other.srfft_))), mel_energies_(other.mel_energies_.Dim(), kUndefined) { for (std::map::iterator iter = mel_banks_.begin(); iter != mel_banks_.end(); ++iter) iter->second = new MelBanks(*(iter->second)); - if (other.srfft_ != NULL) - srfft_ = new SplitRadixRealFft(*(other.srfft_)); } diff --git a/src/feat/feature-mfcc.h b/src/feat/feature-mfcc.h index dbfb9d60364..993d0dc777e 100644 --- a/src/feat/feature-mfcc.h +++ b/src/feat/feature-mfcc.h @@ -1,7 +1,7 @@ // feat/feature-mfcc.h // Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University -// 2014-2016 Johns Hopkins University (author: Daniel Povey) +// 2014-2019 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -39,25 +39,24 @@ struct MfccOptions { FrameExtractionOptions frame_opts; MelBanksOptions mel_opts; int32 num_ceps; // e.g. 13: num cepstral coeffs, counting zero. 
- bool use_energy; // use energy; else C0 - BaseFloat energy_floor; // 0 by default; set to a value like 1.0 or 0.1 if - // you disable dithering. - bool raw_energy; // If true, compute energy before preemphasis and windowing - BaseFloat cepstral_lifter; // Scaling factor on cepstra for HTK compatibility. - // if 0.0, no liftering is done. - bool htk_compat; // if true, put energy/C0 last and introduce a factor of - // sqrt(2) on C0 to be the same as HTK. + bool use_energy; // if true, use energy; else C0 + BaseFloat energy_floor; // Floor on energy, to avoid log(0.0), which will be + // multiplied by sqrt(window-length-in-frames) and + // applied per FFT bin. The value of 1.0e-09 is + // approximately (1.0/32768.0)^2, like a signal value + // of +- 1 in a 16-bit recording. + // cepstral_lifter controls a scaling factor on the cepstra that helps give + // all the MFCC coeffs a similar dynamic range by scaling up the + // higher-frequency coefficients. It's a rather odd formula involving + // a sine. We don't make it configurable. + BaseFloat cepstral_lifter; MfccOptions() : mel_opts(23), - // defaults the #mel-banks to 23 for the MFCC computations. - // this seems to be common for 16khz-sampled data, - // but for 8khz-sampled data, 15 may be better. num_ceps(13), use_energy(true), - energy_floor(0.0), - raw_energy(true), - cepstral_lifter(22.0), - htk_compat(false) {} + energy_floor(1.0e-09), + cepstral_lifter(22.0) { } + void Register(OptionsItf *opts) { frame_opts.Register(opts); @@ -67,17 +66,8 @@ struct MfccOptions { opts->Register("use-energy", &use_energy, "Use energy (not C0) in MFCC computation"); opts->Register("energy-floor", &energy_floor, - "Floor on energy (absolute, not relative) in MFCC computation. " - "Only makes a difference if --use-energy=true; only necessary if " - "--dither=0.0. Suggested values: 0.1 or 1.0"); - opts->Register("raw-energy", &raw_energy, - "If true, compute energy before preemphasis and windowing"); - opts->Register("cepstral-lifter", &cepstral_lifter, - "Constant that controls scaling of MFCCs"); - opts->Register("htk-compat", &htk_compat, - "If true, put energy or C0 last and use a factor of sqrt(2) on " - "C0. Warning: not sufficient to get HTK compatible features " - "(need to change other parameters)."); + "Floor on energy (absolute, not relative) of mel bins etc. " + "in MFCC computation. "); } }; @@ -96,17 +86,10 @@ class MfccComputer { int32 Dim() const { return opts_.num_ceps; } - bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; } - /** Function that computes one frame of features from one frame of signal. - @param [in] signal_raw_log_energy The log-energy of the frame of the signal - prior to windowing and pre-emphasis, or - log(numeric_limits::min()), whichever is greater. Must be - ignored by this function if this class returns false from - this->NeedsRawLogEnergy(). @param [in] vtln_warp The VTLN warping factor that the user wants to be applied when computing features for this utterance. Will normally be 1.0, meaning no warping is to be done. The value will @@ -119,8 +102,7 @@ class MfccComputer { @param [out] feature Pointer to a vector of size this->Dim(), to which the computed feature will be written. */ - void Compute(BaseFloat signal_raw_log_energy, - BaseFloat vtln_warp, + void Compute(BaseFloat vtln_warp, VectorBase *signal_frame, VectorBase *feature); @@ -135,7 +117,6 @@ class MfccComputer { MfccOptions opts_; Vector lifter_coeffs_; Matrix dct_matrix_; // matrix we left-multiply by to perform DCT.
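For reference, the "rather odd formula" mentioned above is the one implemented by ComputeLifterCoeffs earlier in this diff, worked out here for illustration (sample values rounded):

  lifter(i) = 1.0 + 0.5 * Q * sin(M_PI * i / Q),   with Q = cepstral_lifter = 22.0

so lifter(0) = 1.0 (C0 is untouched), lifter(1) is about 2.57, and the scaling peaks at lifter(11) = 12.0; each cepstral coefficient is then multiplied by the corresponding lifter value via feature->MulElements(lifter_coeffs_).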
- BaseFloat log_energy_floor_; std::map mel_banks_; // BaseFloat is VTLN coefficient. SplitRadixRealFft *srfft_; diff --git a/src/feat/feature-plp-test.cc b/src/feat/feature-plp-test.cc deleted file mode 100644 index ad872cffcd0..00000000000 --- a/src/feat/feature-plp-test.cc +++ /dev/null @@ -1,177 +0,0 @@ -// feat/feature-plp-test.cc - -// Copyright 2009-2011 Karel Vesely; Petr Motlicek - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include - -#include "feat/feature-plp.h" -#include "base/kaldi-math.h" -#include "matrix/kaldi-matrix-inl.h" -#include "feat/wave-reader.h" - -using namespace kaldi; - - - - - -/** - */ -static void UnitTestSimple() { - std::cout << "=== UnitTestSimple() ===\n"; - - Vector v(100000); - Matrix m; - - // init with noise - for (int32 i = 0; i < v.Dim(); i++) { - v(i) = (abs( i * 433024253 ) % 65535) - (65535 / 2); - } - - std::cout << "<<<=== Just make sure it runs... Nothing is compared\n"; - // the parametrization object - PlpOptions op; - // trying to have same opts as baseline. - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "rectangular"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.low_freq = 0.0; -// op.htk_compat = true; - - Plp plp(op); - // use default parameters - - // compute mfccs. - plp.Compute(v, 1.0, &m); - - // possibly dump - // std::cout << "== Output features == \n" << m; - std::cout << "Test passed :)\n\n"; -} - - -static void UnitTestHTKCompare1() { - std::cout << "=== UnitTestHTKCompare1() ===\n"; - - std::ifstream is("test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // read the HTK features - Matrix htk_features; - { - std::ifstream is("test_data/test.wav.plp_htk.1", - std::ios::in | std::ios_base::binary); - bool ans = ReadHtk(is, &htk_features, 0); - KALDI_ASSERT(ans); - } - - // use plp with default configuration... - PlpOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.mel_opts.low_freq = 0.0; - op.htk_compat = true; - op.use_energy = false; // C0 not energy. 
- op.cepstral_scale = 1.0; - - Plp plp(op); - - // calculate kaldi features - Matrix kaldi_raw_features; - plp.Compute(waveform, 1.0, &kaldi_raw_features); - - DeltaFeaturesOptions delta_opts; - Matrix kaldi_features; - ComputeDeltas(delta_opts, - kaldi_raw_features, - &kaldi_features); - - // compare the results - bool passed = true; - int32 i_old = -1; - KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); - KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); - // Ignore ends-- we make slightly different choices than - // HTK about how to treat the deltas at the ends. - for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { - for (int32 j = 0; j < kaldi_features.NumCols(); j++) { - BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); - if ((std::abs(b - a)) > 0.10) { //<< TOLERANCE TO DIFFERENCES!!!!! - // print the non-matching data only once per-line - if (i_old != i) { - std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; - std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; - i_old = i; - } - // print indices of non-matching cells - std::cout << "[" << i << ", " << j << "]"; - passed = false; - }}} - if (!passed) KALDI_ERR << "Test failed"; - - // write the htk features for later inspection - HtkHeader header = { - kaldi_features.NumRows(), - 100000, // 10ms - static_cast(sizeof(float)*kaldi_features.NumCols()), - 021413 // PLP_D_A_0 - }; - { - std::ofstream os("tmp.test.wav.plp_kaldi.1", - std::ios::out|std::ios::binary); - WriteHtk(os, kaldi_features, header); - } - - std::cout << "Test passed :)\n\n"; - - unlink("tmp.test.wav.plp_kaldi.1"); -} - - - - -static void UnitTestFeat() { - UnitTestSimple(); - UnitTestHTKCompare1(); -} - - - - -int main() { - try { - for (int i = 0; i < 5; i++) - UnitTestFeat(); - std::cout << "Tests succeeded.\n"; - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return 1; - } -} - - diff --git a/src/feat/feature-plp.cc b/src/feat/feature-plp.cc deleted file mode 100644 index e0c270c7061..00000000000 --- a/src/feat/feature-plp.cc +++ /dev/null @@ -1,191 +0,0 @@ -// feat/feature-plp.cc - -// Copyright 2009-2011 Petr Motlicek; Karel Vesely -// 2016 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- - -#include "feat/feature-plp.h" - -namespace kaldi { - -PlpComputer::PlpComputer(const PlpOptions &opts): - opts_(opts), srfft_(NULL), - mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined), - autocorr_coeffs_(opts_.lpc_order + 1, kUndefined), - lpc_coeffs_(opts_.lpc_order, kUndefined), - raw_cepstrum_(opts_.lpc_order, kUndefined) { - - if (opts.cepstral_lifter != 0.0) { - lifter_coeffs_.Resize(opts.num_ceps); - ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_); - } - InitIdftBases(opts_.lpc_order + 1, opts_.mel_opts.num_bins + 2, - &idft_bases_); - - if (opts.energy_floor > 0.0) - log_energy_floor_ = Log(opts.energy_floor); - - int32 padded_window_size = opts.frame_opts.PaddedWindowSize(); - if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two... - srfft_ = new SplitRadixRealFft(padded_window_size); - - // We'll definitely need the filterbanks info for VTLN warping factor 1.0. - // [note: this call caches it.] - GetMelBanks(1.0); -} - -PlpComputer::PlpComputer(const PlpComputer &other): - opts_(other.opts_), lifter_coeffs_(other.lifter_coeffs_), - idft_bases_(other.idft_bases_), log_energy_floor_(other.log_energy_floor_), - mel_banks_(other.mel_banks_), equal_loudness_(other.equal_loudness_), - srfft_(NULL), - mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined), - autocorr_coeffs_(opts_.lpc_order + 1, kUndefined), - lpc_coeffs_(opts_.lpc_order, kUndefined), - raw_cepstrum_(opts_.lpc_order, kUndefined) { - for (std::map::iterator iter = mel_banks_.begin(); - iter != mel_banks_.end(); ++iter) - iter->second = new MelBanks(*(iter->second)); - for (std::map*>::iterator - iter = equal_loudness_.begin(); - iter != equal_loudness_.end(); ++iter) - iter->second = new Vector(*(iter->second)); - if (other.srfft_ != NULL) - srfft_ = new SplitRadixRealFft(*(other.srfft_)); -} - -PlpComputer::~PlpComputer() { - for (std::map::iterator iter = mel_banks_.begin(); - iter != mel_banks_.end(); ++iter) - delete iter->second; - for (std::map* >::iterator - iter = equal_loudness_.begin(); - iter != equal_loudness_.end(); ++iter) - delete iter->second; - delete srfft_; -} - -const MelBanks *PlpComputer::GetMelBanks(BaseFloat vtln_warp) { - MelBanks *this_mel_banks = NULL; - std::map::iterator iter = mel_banks_.find(vtln_warp); - if (iter == mel_banks_.end()) { - this_mel_banks = new MelBanks(opts_.mel_opts, - opts_.frame_opts, - vtln_warp); - mel_banks_[vtln_warp] = this_mel_banks; - } else { - this_mel_banks = iter->second; - } - return this_mel_banks; -} - -const Vector *PlpComputer::GetEqualLoudness(BaseFloat vtln_warp) { - const MelBanks *this_mel_banks = GetMelBanks(vtln_warp); - Vector *ans = NULL; - std::map*>::iterator iter - = equal_loudness_.find(vtln_warp); - if (iter == equal_loudness_.end()) { - ans = new Vector; - GetEqualLoudnessVector(*this_mel_banks, ans); - equal_loudness_[vtln_warp] = ans; - } else { - ans = iter->second; - } - return ans; -} - -void PlpComputer::Compute(BaseFloat signal_raw_log_energy, - BaseFloat vtln_warp, - VectorBase *signal_frame, - VectorBase *feature) { - KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() && - feature->Dim() == this->Dim()); - - const MelBanks &mel_banks = *GetMelBanks(vtln_warp); - const Vector &equal_loudness = *GetEqualLoudness(vtln_warp); - - - KALDI_ASSERT(opts_.num_ceps <= opts_.lpc_order+1); // our num-ceps includes C0. 
- - - if (opts_.use_energy && !opts_.raw_energy) - signal_raw_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), - std::numeric_limits::min())); - - if (srfft_ != NULL) // Compute FFT using split-radix algorithm. - srfft_->Compute(signal_frame->Data(), true); - else // An alternative algorithm that works for non-powers-of-two. - RealFft(signal_frame, true); - - // Convert the FFT into a power spectrum. - ComputePowerSpectrum(signal_frame); // elements 0 ... signal_frame->Dim()/2 - - SubVector power_spectrum(*signal_frame, - 0, signal_frame->Dim() / 2 + 1); - - int32 num_mel_bins = opts_.mel_opts.num_bins; - - SubVector mel_energies(mel_energies_duplicated_, 1, num_mel_bins); - - mel_banks.Compute(power_spectrum, &mel_energies); - - mel_energies.MulElements(equal_loudness); - - mel_energies.ApplyPow(opts_.compress_factor); - - // duplicate first and last elements - mel_energies_duplicated_(0) = mel_energies_duplicated_(1); - mel_energies_duplicated_(num_mel_bins + 1) = - mel_energies_duplicated_(num_mel_bins); - - autocorr_coeffs_.SetZero(); // In case of NaNs or infs - autocorr_coeffs_.AddMatVec(1.0, idft_bases_, kNoTrans, - mel_energies_duplicated_, 0.0); - - BaseFloat residual_log_energy = ComputeLpc(autocorr_coeffs_, &lpc_coeffs_); - - residual_log_energy = std::max(residual_log_energy, - std::numeric_limits::min()); - - Lpc2Cepstrum(opts_.lpc_order, lpc_coeffs_.Data(), raw_cepstrum_.Data()); - feature->Range(1, opts_.num_ceps - 1).CopyFromVec( - raw_cepstrum_.Range(0, opts_.num_ceps - 1)); - (*feature)(0) = residual_log_energy; - - if (opts_.cepstral_lifter != 0.0) - feature->MulElements(lifter_coeffs_); - - if (opts_.cepstral_scale != 1.0) - feature->Scale(opts_.cepstral_scale); - - if (opts_.use_energy) { - if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) - signal_raw_log_energy = log_energy_floor_; - (*feature)(0) = signal_raw_log_energy; - } - - if (opts_.htk_compat) { // reorder the features. - BaseFloat log_energy = (*feature)(0); - for (int32 i = 0; i < opts_.num_ceps-1; i++) - (*feature)(i) = (*feature)(i+1); - (*feature)(opts_.num_ceps-1) = log_energy; - } -} - - -} // namespace kaldi diff --git a/src/feat/feature-plp.h b/src/feat/feature-plp.h deleted file mode 100644 index 4f156ca1e88..00000000000 --- a/src/feat/feature-plp.h +++ /dev/null @@ -1,176 +0,0 @@ -// feat/feature-plp.h - -// Copyright 2009-2011 Petr Motlicek; Karel Vesely - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_FEAT_FEATURE_PLP_H_ -#define KALDI_FEAT_FEATURE_PLP_H_ - -#include -#include - -#include "feat/feature-common.h" -#include "feat/feature-functions.h" -#include "feat/feature-window.h" -#include "feat/mel-computations.h" -#include "itf/options-itf.h" - -namespace kaldi { -/// @addtogroup feat FeatureExtraction -/// @{ - - - -/// PlpOptions contains basic options for computing PLP features. 
-/// It only includes things that can be done in a "stateless" way, i.e. -/// it does not include energy max-normalization. -/// It does not include delta computation. -struct PlpOptions { - FrameExtractionOptions frame_opts; - MelBanksOptions mel_opts; - int32 lpc_order; - int32 num_ceps; // num cepstra including zero - bool use_energy; // use energy; else C0 - BaseFloat energy_floor; - bool raw_energy; // If true, compute energy before preemphasis and windowing - BaseFloat compress_factor; - int32 cepstral_lifter; - BaseFloat cepstral_scale; - - bool htk_compat; // if true, put energy/C0 last and introduce a factor of - // sqrt(2) on C0 to be the same as HTK. - - PlpOptions() : mel_opts(23), - // default number of mel-banks for the PLP computation; this - // seems to be common for 16kHz-sampled data. For 8kHz-sampled - // data, 15 may be better. - lpc_order(12), - num_ceps(13), - use_energy(true), - energy_floor(0.0), - raw_energy(true), - compress_factor(0.33333), - cepstral_lifter(22), - cepstral_scale(1.0), - htk_compat(false) {} - - void Register(OptionsItf *opts) { - frame_opts.Register(opts); - mel_opts.Register(opts); - opts->Register("lpc-order", &lpc_order, - "Order of LPC analysis in PLP computation"); - opts->Register("num-ceps", &num_ceps, - "Number of cepstra in PLP computation (including C0)"); - opts->Register("use-energy", &use_energy, - "Use energy (not C0) for zeroth PLP feature"); - opts->Register("energy-floor", &energy_floor, - "Floor on energy (absolute, not relative) in PLP computation. " - "Only makes a difference if --use-energy=true; only necessary if " - "--dither=0.0. Suggested values: 0.1 or 1.0"); - opts->Register("raw-energy", &raw_energy, - "If true, compute energy before preemphasis and windowing"); - opts->Register("compress-factor", &compress_factor, - "Compression factor in PLP computation"); - opts->Register("cepstral-lifter", &cepstral_lifter, - "Constant that controls scaling of PLPs"); - opts->Register("cepstral-scale", &cepstral_scale, - "Scaling constant in PLP computation"); - opts->Register("htk-compat", &htk_compat, - "If true, put energy or C0 last. Warning: not sufficient " - "to get HTK compatible features (need to change other " - "parameters)."); - } -}; - - -/// This is the new-style interface to the PLP computation. -class PlpComputer { - public: - typedef PlpOptions Options; - explicit PlpComputer(const PlpOptions &opts); - PlpComputer(const PlpComputer &other); - - const FrameExtractionOptions &GetFrameOptions() const { - return opts_.frame_opts; - } - - int32 Dim() const { return opts_.num_ceps; } - - bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; } - - /** - Function that computes one frame of features from - one frame of signal. - - @param [in] signal_raw_log_energy The log-energy of the frame of the signal - prior to windowing and pre-emphasis, or - log(numeric_limits::min()), whichever is greater. Must be - ignored by this function if this class returns false from - this->NeedsRawLogEnergy(). - @param [in] vtln_warp The VTLN warping factor that the user wants - to be applied when computing features for this utterance. Will - normally be 1.0, meaning no warping is to be done. The value will - be ignored for feature types that don't support VLTN, such as - spectrogram features. - @param [in] signal_frame One frame of the signal, - as extracted using the function ExtractWindow() using the options - returned by this->GetFrameOptions(). 
The function will use the - vector as a workspace, which is why it's a non-const pointer. - @param [out] feature Pointer to a vector of size this->Dim(), to which - the computed feature will be written. - */ - void Compute(BaseFloat signal_raw_log_energy, - BaseFloat vtln_warp, - VectorBase *signal_frame, - VectorBase *feature); - - ~PlpComputer(); - private: - - const MelBanks *GetMelBanks(BaseFloat vtln_warp); - - const Vector *GetEqualLoudness(BaseFloat vtln_warp); - - PlpOptions opts_; - Vector lifter_coeffs_; - Matrix idft_bases_; - BaseFloat log_energy_floor_; - std::map mel_banks_; // BaseFloat is VTLN coefficient. - std::map* > equal_loudness_; - SplitRadixRealFft *srfft_; - - // temporary vector used inside Compute; size is opts_.mel_opts.num_bins + 2 - Vector mel_energies_duplicated_; - // temporary vector used inside Compute; size is opts_.lpc_order + 1 - Vector autocorr_coeffs_; - // temporary vector used inside Compute; size is opts_.lpc_order - Vector lpc_coeffs_; - // temporary vector used inside Compute; size is opts_.lpc_order - Vector raw_cepstrum_; - - // Disallow assignment. - PlpComputer &operator =(const PlpComputer &other); -}; - -typedef OfflineFeatureTpl Plp; - -/// @} End of "addtogroup feat" - -} // namespace kaldi - - -#endif // KALDI_FEAT_FEATURE_PLP_H_ diff --git a/src/feat/feature-sdc-test.cc b/src/feat/feature-sdc-test.cc index 4b99c65fef8..42370ce4715 100644 --- a/src/feat/feature-sdc-test.cc +++ b/src/feat/feature-sdc-test.cc @@ -45,7 +45,7 @@ static void UnitTestCompareWithDeltaFeatures(Matrix &raw_features, in int32 dd_num_rows = deltas_features.NumRows(); int32 sdc_num_rows = shifted_deltas_features.NumRows(); int32 num_features = raw_features.NumCols(); - + // Number of rows will be equal, but not // columns, in general. KALDI_ASSERT(dd_num_rows == sdc_num_rows); @@ -60,7 +60,7 @@ static void UnitTestCompareWithDeltaFeatures(Matrix &raw_features, in } } -static void UnitTestParams(Matrix &raw_features, int32 window, +static void UnitTestParams(Matrix &raw_features, int32 window, int32 shift, int32 n_blocks) { std::cout << "=== UnitTestSDCParams() ===\n"; ShiftedDeltaFeaturesOptions shifted_deltas_opts; @@ -78,8 +78,8 @@ static void UnitTestParams(Matrix &raw_features, int32 window, int32 sdc_num_cols = shifted_deltas_features.NumCols(); KALDI_ASSERT(sdc_num_cols == raw_num_cols * (n_blocks + 1)); - - /* For every coefficient in the raw feature vector a + + /* For every coefficient in the raw feature vector a delta is calculated and appended to the new feature vector, as is done normally in a delta-deltas computation. In addition, n_blocks delta in advance are also appended. @@ -89,7 +89,7 @@ static void UnitTestParams(Matrix &raw_features, int32 window, mapping from these additional deltas to where they would appear in a delta-deltas computation and verfies these values' equality. 
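For example (illustrative numbers, not taken from the test): with raw_num_cols = 13 and shift = 3, the extra block of columns starting at j = 2 * 13 = 26 is checked against the ordinary delta features taken at row i + (26/13 - 1) * 3 = i + 3, the next block (j = 39) against row i + 6, and so on.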
*/ - for (int32 i = 0; i < sdc_num_rows; i++) { + for (int32 i = 0; i < sdc_num_rows; i++) { for (int32 j = 2 * raw_num_cols; j < sdc_num_cols; j += raw_num_cols) { for (int32 k = 0; k < raw_num_cols; k++) { int32 row = i + (j/raw_num_cols - 1) * shift; @@ -103,7 +103,7 @@ static void UnitTestParams(Matrix &raw_features, int32 window, } } -static void UnitTestEndEffects(Matrix &raw_features, int32 window, +static void UnitTestEndEffects(Matrix &raw_features, int32 window, int32 shift, int32 n_blocks) { std::cout << "=== UnitTestSDCEndEffects() ===\n"; ShiftedDeltaFeaturesOptions shifted_deltas_opts; @@ -118,7 +118,7 @@ static void UnitTestEndEffects(Matrix &raw_features, int32 window, int32 raw_num_cols = raw_features.NumCols(); int32 sdc_num_rows = shifted_deltas_features.NumRows(); int32 sdc_num_cols = shifted_deltas_features.NumCols(); - + // If the entire window is out-of-bounds the delta should be zero. for (int32 i = sdc_num_rows - n_blocks + 1; i < sdc_num_rows; i++) { for (int32 j = 2 * raw_num_cols; j < sdc_num_cols; j += raw_num_cols) { @@ -126,7 +126,7 @@ static void UnitTestEndEffects(Matrix &raw_features, int32 window, if (i + (j/raw_num_cols - 1) * shift - window/2 > sdc_num_rows) KALDI_ASSERT(shifted_deltas_features(i, j + k) <= 0.00001); } - } + } } } @@ -137,13 +137,7 @@ int main() { KALDI_ASSERT(wave.Data().NumRows() == 1); SubVector waveform(wave.Data(), 0); - // mfcc with default configuration... MfccOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; op.mel_opts.low_freq = 0.0; op.use_energy = false; Mfcc mfcc(op); @@ -165,6 +159,5 @@ int main() { static_cast(e); return 1; } - -} +} diff --git a/src/feat/feature-spectrogram.cc b/src/feat/feature-spectrogram.cc deleted file mode 100644 index 7eee2643cf5..00000000000 --- a/src/feat/feature-spectrogram.cc +++ /dev/null @@ -1,82 +0,0 @@ -// feat/feature-spectrogram.cc - -// Copyright 2009-2012 Karel Vesely -// Copyright 2012 Navdeep Jaitly - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- - -#include "feat/feature-spectrogram.h" - - -namespace kaldi { - -SpectrogramComputer::SpectrogramComputer(const SpectrogramOptions &opts) - : opts_(opts), srfft_(NULL) { - if (opts.energy_floor > 0.0) - log_energy_floor_ = Log(opts.energy_floor); - - int32 padded_window_size = opts.frame_opts.PaddedWindowSize(); - if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two - srfft_ = new SplitRadixRealFft(padded_window_size); -} - -SpectrogramComputer::SpectrogramComputer(const SpectrogramComputer &other): - opts_(other.opts_), log_energy_floor_(other.log_energy_floor_), srfft_(NULL) { - if (other.srfft_ != NULL) - srfft_ = new SplitRadixRealFft(*other.srfft_); -} - -SpectrogramComputer::~SpectrogramComputer() { - delete srfft_; -} - -void SpectrogramComputer::Compute(BaseFloat signal_raw_log_energy, - BaseFloat vtln_warp, - VectorBase *signal_frame, - VectorBase *feature) { - KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() && - feature->Dim() == this->Dim()); - - - // Compute energy after window function (not the raw one) - if (!opts_.raw_energy) - signal_raw_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), - std::numeric_limits::epsilon())); - - if (srfft_ != NULL) // Compute FFT using split-radix algorithm. - srfft_->Compute(signal_frame->Data(), true); - else // An alternative algorithm that works for non-powers-of-two - RealFft(signal_frame, true); - - // Convert the FFT into a power spectrum. - ComputePowerSpectrum(signal_frame); - SubVector power_spectrum(*signal_frame, - 0, signal_frame->Dim() / 2 + 1); - - power_spectrum.ApplyFloor(std::numeric_limits::epsilon()); - power_spectrum.ApplyLog(); - - feature->CopyFromVec(power_spectrum); - - if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) - signal_raw_log_energy = log_energy_floor_; - // The zeroth spectrogram component is always set to the signal energy, - // instead of the square of the constant component of the signal. - (*feature)(0) = signal_raw_log_energy; -} - -} // namespace kaldi diff --git a/src/feat/feature-spectrogram.h b/src/feat/feature-spectrogram.h deleted file mode 100644 index 132a6875e00..00000000000 --- a/src/feat/feature-spectrogram.h +++ /dev/null @@ -1,117 +0,0 @@ -// feat/feature-spectrogram.h - -// Copyright 2009-2012 Karel Vesely -// Copyright 2012 Navdeep Jaitly - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_FEAT_FEATURE_SPECTROGRAM_H_ -#define KALDI_FEAT_FEATURE_SPECTROGRAM_H_ - - -#include - -#include "feat/feature-common.h" -#include "feat/feature-functions.h" -#include "feat/feature-window.h" - -namespace kaldi { -/// @addtogroup feat FeatureExtraction -/// @{ - - -/// SpectrogramOptions contains basic options for computing spectrogram -/// features. 
-struct SpectrogramOptions { - FrameExtractionOptions frame_opts; - BaseFloat energy_floor; - bool raw_energy; // If true, compute energy before preemphasis and windowing - - SpectrogramOptions() : - energy_floor(0.0), - raw_energy(true) {} - - void Register(OptionsItf *opts) { - frame_opts.Register(opts); - opts->Register("energy-floor", &energy_floor, - "Floor on energy (absolute, not relative) in Spectrogram " - "computation. Caution: this floor is applied to the zeroth " - "component, representing the total signal energy. The " - "floor on the individual spectrogram elements is fixed at " - "std::numeric_limits::epsilon()."); - opts->Register("raw-energy", &raw_energy, - "If true, compute energy before preemphasis and windowing"); - } -}; - -/// Class for computing spectrogram features. -class SpectrogramComputer { - public: - typedef SpectrogramOptions Options; - explicit SpectrogramComputer(const SpectrogramOptions &opts); - SpectrogramComputer(const SpectrogramComputer &other); - - const FrameExtractionOptions& GetFrameOptions() const { - return opts_.frame_opts; - } - - int32 Dim() const { return opts_.frame_opts.PaddedWindowSize() / 2 + 1; } - - bool NeedRawLogEnergy() const { return opts_.raw_energy; } - - - /** - Function that computes one frame of spectrogram features from - one frame of signal. - - @param [in] signal_raw_log_energy The log-energy of the frame of the signal - prior to windowing and pre-emphasis, or - log(numeric_limits::min()), whichever is greater. Must be - ignored by this function if this class returns false from - this->NeedsRawLogEnergy(). - @param [in] vtln_warp This is ignored by this function, it's only - needed for interface compatibility. - @param [in] signal_frame One frame of the signal, - as extracted using the function ExtractWindow() using the options - returned by this->GetFrameOptions(). The function will use the - vector as a workspace, which is why it's a non-const pointer. - @param [out] feature Pointer to a vector of size this->Dim(), to which - the computed feature will be written. - */ - void Compute(BaseFloat signal_raw_log_energy, - BaseFloat vtln_warp, - VectorBase *signal_frame, - VectorBase *feature); - - ~SpectrogramComputer(); - - private: - SpectrogramOptions opts_; - BaseFloat log_energy_floor_; - SplitRadixRealFft *srfft_; - - // Disallow assignment. 
- SpectrogramComputer &operator=(const SpectrogramComputer &other); -}; - -typedef OfflineFeatureTpl Spectrogram; - - -/// @} End of "addtogroup feat" -} // namespace kaldi - - -#endif // KALDI_FEAT_FEATURE_SPECTROGRAM_H_ diff --git a/src/feat/feature-window.cc b/src/feat/feature-window.cc index c5d4cc29831..cd7b1a26326 100644 --- a/src/feat/feature-window.cc +++ b/src/feat/feature-window.cc @@ -1,7 +1,7 @@ // feat/feature-window.cc // Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation -// 2013-2016 Johns Hopkins University (author: Daniel Povey) +// 2013-2019 Johns Hopkins University (author: Daniel Povey) // 2014 IMSL, PKU-HKUST (author: Wei Shi) // See ../../COPYING for clarification regarding multiple authors @@ -30,13 +30,9 @@ namespace kaldi { int64 FirstSampleOfFrame(int32 frame, const FrameExtractionOptions &opts) { int64 frame_shift = opts.WindowShift(); - if (opts.snip_edges) { - return frame * frame_shift; - } else { - int64 midpoint_of_frame = frame_shift * frame + frame_shift / 2, - beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2; - return beginning_of_frame; - } + int64 midpoint_of_frame = frame_shift * frame + frame_shift / 2, + beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2; + return beginning_of_frame; } int32 NumFrames(int64 num_samples, @@ -44,85 +40,54 @@ int32 NumFrames(int64 num_samples, bool flush) { int64 frame_shift = opts.WindowShift(); int64 frame_length = opts.WindowSize(); - if (opts.snip_edges) { - // with --snip-edges=true (the default), we use a HTK-like approach to - // determining the number of frames-- all frames have to fit completely into - // the waveform, and the first frame begins at sample zero. - if (num_samples < frame_length) - return 0; - else - return (1 + ((num_samples - frame_length) / frame_shift)); - // You can understand the expression above as follows: 'num_samples - - // frame_length' is how much room we have to shift the frame within the - // waveform; 'frame_shift' is how much we shift it each time; and the ratio - // is how many times we can shift it (integer arithmetic rounds down). - } else { - // if --snip-edges=false, the number of frames is determined by rounding the - // (file-length / frame-shift) to the nearest integer. The point of this - // formula is to make the number of frames an obvious and predictable - // function of the frame shift and signal length, which makes many - // segmentation-related questions simpler. - // - // Because integer division in C++ rounds toward zero, we add (half the - // frame-shift minus epsilon) before dividing, to have the effect of - // rounding towards the closest integer. - int32 num_frames = (num_samples + (frame_shift / 2)) / frame_shift; - - if (flush) - return num_frames; - - // note: 'end' always means the last plus one, i.e. one past the last. - int64 end_sample_of_last_frame = FirstSampleOfFrame(num_frames - 1, opts) - + frame_length; - - // the following code is optimized more for clarity than efficiency. - // If flush == false, we can't output frames that extend past the end - // of the signal. - while (num_frames > 0 && end_sample_of_last_frame > num_samples) { - num_frames--; - end_sample_of_last_frame -= frame_shift; - } + + // The number of frames is determined by rounding the + // (file-length / frame-shift) to the nearest integer. 
The point of this + // formula is to make the number of frames an obvious and predictable + // function of the frame shift and signal length, which makes many + // segmentation-related questions simpler. + // + // Because integer division in C++ rounds toward zero, we add (half the + // frame-shift minus epsilon) before dividing, to have the effect of + // rounding towards the closest integer. + int32 num_frames = (num_samples + (frame_shift / 2)) / frame_shift; + + if (flush) return num_frames; - } -} + // note: 'end' always means the last plus one, i.e. one past the last. + int64 end_sample_of_last_frame = FirstSampleOfFrame(num_frames - 1, opts) + + frame_length; -void Dither(VectorBase *waveform, BaseFloat dither_value) { - if (dither_value == 0.0) - return; - int32 dim = waveform->Dim(); - BaseFloat *data = waveform->Data(); - RandomState rstate; - for (int32 i = 0; i < dim; i++) - data[i] += RandGauss(&rstate) * dither_value; + // the following code is optimized more for clarity than efficiency. + // If flush == false, we can't output frames that extend past the end + // of the signal. + while (num_frames > 0 && end_sample_of_last_frame > num_samples) { + num_frames--; + end_sample_of_last_frame -= frame_shift; + } + return num_frames; } -void Preemphasize(VectorBase *waveform, BaseFloat preemph_coeff) { - if (preemph_coeff == 0.0) return; - KALDI_ASSERT(preemph_coeff >= 0.0 && preemph_coeff <= 1.0); - for (int32 i = waveform->Dim()-1; i > 0; i--) - (*waveform)(i) -= preemph_coeff * (*waveform)(i-1); - (*waveform)(0) -= preemph_coeff * (*waveform)(0); -} - -FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts) { +void InitFeatureWindowFunction(const FrameExtractionOptions &opts, + Vector *window_function) { int32 frame_length = opts.WindowSize(); KALDI_ASSERT(frame_length > 0); - window.Resize(frame_length); + window_function->Resize(frame_length); double a = M_2PI / (frame_length-1); for (int32 i = 0; i < frame_length; i++) { double i_fl = static_cast(i); if (opts.window_type == "hanning") { - window(i) = 0.5 - 0.5*cos(a * i_fl); + (*window_function)(i) = 0.5 - 0.5*cos(a * i_fl); } else if (opts.window_type == "hamming") { - window(i) = 0.54 - 0.46*cos(a * i_fl); + (*window_function)(i) = 0.54 - 0.46*cos(a * i_fl); } else if (opts.window_type == "povey") { // like hamming but goes to zero at edges. 
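// Illustrative sketch, not part of the patch: the frame-counting and
// frame-placement formulas described in the comments above, written as
// standalone functions (plain int64 arithmetic, no Kaldi types).  The real
// NumFrames() additionally drops trailing frames when flush == false.
#include <cstdint>

// The number of frames now depends only on the frame shift:
// round(num_samples / frame_shift), done with integer arithmetic.
inline int64_t SketchNumFrames(int64_t num_samples, int64_t frame_shift) {
  return (num_samples + frame_shift / 2) / frame_shift;
}

// Frame 'f' is centered on sample f * frame_shift + frame_shift / 2, so it
// starts half a window earlier; near the start this can be negative, and the
// calling code reflects the signal at the edge.
inline int64_t SketchFirstSampleOfFrame(int64_t f, int64_t frame_shift,
                                        int64_t frame_length) {
  int64_t midpoint = f * frame_shift + frame_shift / 2;
  return midpoint - frame_length / 2;
}
// Example: 16000 samples at a 160-sample shift -> 100 frames; with a
// 400-sample window, frame 0 starts at sample -120.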
- window(i) = pow(0.5 - 0.5*cos(a * i_fl), 0.85); + (*window_function)(i) = pow(0.5 - 0.5*cos(a * i_fl), 0.85); } else if (opts.window_type == "rectangular") { - window(i) = 1.0; + (*window_function)(i) = 1.0; } else if (opts.window_type == "blackman") { - window(i) = opts.blackman_coeff - 0.5*cos(a * i_fl) + + (*window_function)(i) = opts.blackman_coeff - 0.5*cos(a * i_fl) + (0.5 - opts.blackman_coeff) * cos(2 * a * i_fl); } else { KALDI_ERR << "Invalid window type " << opts.window_type; @@ -131,54 +96,34 @@ FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts) } void ProcessWindow(const FrameExtractionOptions &opts, - const FeatureWindowFunction &window_function, - VectorBase *window, - BaseFloat *log_energy_pre_window) { + const VectorBase &window_function, + VectorBase *window) { int32 frame_length = opts.WindowSize(); KALDI_ASSERT(window->Dim() == frame_length); - if (opts.dither != 0.0) - Dither(window, opts.dither); - - if (opts.remove_dc_offset) - window->Add(-window->Sum() / frame_length); - - if (log_energy_pre_window != NULL) { - BaseFloat energy = std::max(VecVec(*window, *window), - std::numeric_limits::epsilon()); - *log_energy_pre_window = Log(energy); - } - if (opts.preemph_coeff != 0.0) - Preemphasize(window, opts.preemph_coeff); + /* This was formerly enabled by the --remove-dc-offset option. It is now + done unconditionally. */ + window->Add(-window->Sum() / frame_length); - window->MulElements(window_function.window); + window->MulElements(window_function); } // ExtractWindow extracts a windowed frame of waveform with a power-of-two, -// padded size. It does mean subtraction, pre-emphasis and dithering as -// requested. +// padded size. It does mean subtraction (removal of the DC offset). void ExtractWindow(int64 sample_offset, const VectorBase &wave, int32 f, // with 0 <= f < NumFrames(feats, opts) const FrameExtractionOptions &opts, - const FeatureWindowFunction &window_function, - Vector *window, - BaseFloat *log_energy_pre_window) { + const VectorBase &window_function, + Vector *window) { KALDI_ASSERT(sample_offset >= 0 && wave.Dim() != 0); int32 frame_length = opts.WindowSize(), frame_length_padded = opts.PaddedWindowSize(); - int64 num_samples = sample_offset + wave.Dim(), - start_sample = FirstSampleOfFrame(f, opts), - end_sample = start_sample + frame_length; + int64 start_sample = FirstSampleOfFrame(f, opts); - if (opts.snip_edges) { - KALDI_ASSERT(start_sample >= sample_offset && - end_sample <= num_samples); - } else { - KALDI_ASSERT(sample_offset == 0 || start_sample >= sample_offset); - } + KALDI_ASSERT(sample_offset == 0 || start_sample >= sample_offset); if (window->Dim() != frame_length_padded) window->Resize(frame_length_padded, kUndefined); @@ -216,7 +161,8 @@ void ExtractWindow(int64 sample_offset, SubVector frame(*window, 0, frame_length); - ProcessWindow(opts, window_function, &frame, log_energy_pre_window); + ProcessWindow(opts, window_function, &frame); + } } // namespace kaldi diff --git a/src/feat/feature-window.h b/src/feat/feature-window.h index a7abba50eca..979a6cac249 100644 --- a/src/feat/feature-window.h +++ b/src/feat/feature-window.h @@ -36,17 +36,12 @@ struct FrameExtractionOptions { BaseFloat samp_freq; BaseFloat frame_shift_ms; // in milliseconds. BaseFloat frame_length_ms; // in milliseconds. - BaseFloat dither; // Amount of dithering, 0.0 means no dither. - BaseFloat preemph_coeff; // Preemphasis coefficient. - bool remove_dc_offset; // Subtract mean of wave before FFT. std::string window_type; // e.g.
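// Illustrative sketch, not part of the patch: with dithering and pre-emphasis
// gone, per-frame processing reduces to removing the DC offset and applying
// the window function, as in ProcessWindow() above.  This standalone version
// uses std::vector<float> instead of kaldi::VectorBase.
#include <cstddef>
#include <vector>

inline void SketchProcessWindow(const std::vector<float> &window_function,
                                std::vector<float> *frame) {
  // Remove the DC offset (formerly optional via --remove-dc-offset).
  double sum = 0.0;
  for (float s : *frame) sum += s;
  const float mean = static_cast<float>(sum / frame->size());
  for (std::size_t i = 0; i < frame->size(); i++) {
    (*frame)[i] -= mean;
    (*frame)[i] *= window_function[i];  // e.g. a "povey" or Hamming window
  }
}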
Hamming window // May be "hamming", "rectangular", "povey", "hanning", "blackman" // "povey" is a window I made to be similar to Hamming but to go to zero at the // edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) // I just don't think the Hamming window makes sense as a windowing function. - bool round_to_power_of_two; BaseFloat blackman_coeff; - bool snip_edges; bool allow_downsample; bool allow_upsample; int max_feature_vectors; @@ -54,17 +49,11 @@ struct FrameExtractionOptions { samp_freq(16000), frame_shift_ms(10.0), frame_length_ms(25.0), - dither(1.0), - preemph_coeff(0.97), - remove_dc_offset(true), window_type("povey"), - round_to_power_of_two(true), blackman_coeff(0.42), - snip_edges(true), allow_downsample(false), allow_upsample(false), - max_feature_vectors(-1) - { } + max_feature_vectors(-1) { } void Register(OptionsItf *opts) { opts->Register("sample-frequency", &samp_freq, @@ -72,26 +61,11 @@ struct FrameExtractionOptions { "if specified there)"); opts->Register("frame-length", &frame_length_ms, "Frame length in milliseconds"); opts->Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds"); - opts->Register("preemphasis-coefficient", &preemph_coeff, - "Coefficient for use in signal preemphasis"); - opts->Register("remove-dc-offset", &remove_dc_offset, - "Subtract mean from waveform on each frame"); - opts->Register("dither", &dither, "Dithering constant (0.0 means no dither). " - "If you turn this off, you should set the --energy-floor " - "option, e.g. to 1.0 or 0.1"); opts->Register("window-type", &window_type, "Type of window " "(\"hamming\"|\"hanning\"|\"povey\"|\"rectangular\"" "|\"blackmann\")"); opts->Register("blackman-coeff", &blackman_coeff, "Constant coefficient for generalized Blackman window."); - opts->Register("round-to-power-of-two", &round_to_power_of_two, - "If true, round window size to power of two by zero-padding " - "input to FFT."); - opts->Register("snip-edges", &snip_edges, - "If true, end effects will be handled by outputting only frames that " - "completely fit in the file, and the number of frames depends on the " - "frame-length. If false, the number of frames depends only on the " - "frame-shift, and we reflect the data at the ends."); opts->Register("allow-downsample", &allow_downsample, "If true, allow the input waveform to have a higher frequency than " "the specified --sample-frequency (and we'll downsample)."); @@ -110,19 +84,16 @@ struct FrameExtractionOptions { return static_cast(samp_freq * 0.001 * frame_length_ms); } int32 PaddedWindowSize() const { - return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize()) : - WindowSize()); + return RoundUpToNearestPowerOfTwo(WindowSize()); } }; -struct FeatureWindowFunction { - FeatureWindowFunction() {} - explicit FeatureWindowFunction(const FrameExtractionOptions &opts); - FeatureWindowFunction(const FeatureWindowFunction &other): - window(other.window) { } - Vector window; -}; +// Sets up the feature window function (e.g. Hamming) as specified by the +// options. +void InitFeatureWindowFunction( + const FrameExtractionOptions &opts, + Vector *window_function); /** @@ -135,8 +106,7 @@ struct FeatureWindowFunction { @param [in] flush True if we are asserting that this number of samples is 'all there is', false if we expecting more data to possibly come - in. This only makes a difference to the answer if opts.snips_edges - == false. For offline feature extraction you always want flush == + in. For offline feature extraction you always want flush == true. 
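// Illustrative sketch, not part of the patch: with --round-to-power-of-two
// removed, the FFT size is always the window length rounded up to a power of
// two.  With the defaults above (16 kHz sample rate, 25 ms window, 10 ms
// shift): WindowSize() = 400 samples, WindowShift() = 160 samples, and the
// padded (FFT) size is 512.  A standalone version of that rounding:
#include <cstdint>

inline int32_t SketchRoundUpToPowerOfTwo(int32_t n) {
  int32_t p = 1;
  while (p < n) p *= 2;  // e.g. 400 -> 512
  return p;
}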
In an online-decoding context, once you know (or decide) that no more data is coming in, you'd call it with flush == true at the end to flush out any remaining data. @@ -146,25 +116,27 @@ int32 NumFrames(int64 num_samples, bool flush = true); /* - This function returns the index of the first sample of the frame indexed - 'frame'. If snip-edges=true, it just returns frame * opts.WindowShift(); if - snip-edges=false, the formula is a little more complicated and the result may - be negative. + This function returns the sample-index of the first sample of the frame + indexed 'frame'. + @param [in] frame frame index frame >= 0 + @param [in] opts Options class, used for window width, and frame + shift. + @return Returns the sample index of the first sample of + this frame. Note: this may be negative if + `frame` is close to zero. The calling code + will handle this by reflecting the signal in + the boundary. */ int64 FirstSampleOfFrame(int32 frame, const FrameExtractionOptions &opts); -void Dither(VectorBase *waveform, BaseFloat dither_value); - -void Preemphasize(VectorBase *waveform, BaseFloat preemph_coeff); - /** - This function does all the windowing steps after actually - extracting the windowed signal: depending on the - configuration, it does dithering, dc offset removal, - preemphasis, and multiplication by the windowing function. + This function does all the windowing steps after actually extracting the + windowed signal: depending on the configuration, it does dc offset removal and + multiplication by the windowing function. + @param [in] opts The options class to be used @param [in] window_function The windowing function-- should have been initialized using 'opts'. @@ -173,14 +145,10 @@ void Preemphasize(VectorBase *waveform, BaseFloat preemph_coeff); opts.PaddedWindowSize(), with the remaining samples zero, as the FFT code is more efficient if it operates on data with power-of-two size. - @param [out] log_energy_pre_window If non-NULL, then after dithering and - DC offset removal, this function will write to this pointer the log of - the total energy (i.e. sum-squared) of the frame. */ void ProcessWindow(const FrameExtractionOptions &opts, - const FeatureWindowFunction &window_function, - VectorBase *window, - BaseFloat *log_energy_pre_window = NULL); + const VectorBase &window_function, + VectorBase *window); /* @@ -202,18 +170,15 @@ void ProcessWindow(const FrameExtractionOptions &opts, @param [in] window_function The windowing function, as derived from the options class. @param [out] window The windowed, possibly-padded waveform to be - extracted. Will be resized as needed. - @param [out] log_energy_pre_window If non-NULL, the log-energy of - the signal prior to pre-emphasis and multiplying by - the windowing function will be written to here. + extracted. Will be resized as needed.
*/ void ExtractWindow(int64 sample_offset, const VectorBase &wave, int32 f, const FrameExtractionOptions &opts, - const FeatureWindowFunction &window_function, - Vector *window, - BaseFloat *log_energy_pre_window = NULL); + const VectorBase &window_function, + Vector *window); + /// @} End of "addtogroup feat" diff --git a/src/feat/mel-computations.cc b/src/feat/mel-computations.cc index bb5e9f9acff..be050b386ee 100644 --- a/src/feat/mel-computations.cc +++ b/src/feat/mel-computations.cc @@ -32,8 +32,7 @@ namespace kaldi { MelBanks::MelBanks(const MelBanksOptions &opts, const FrameExtractionOptions &frame_opts, - BaseFloat vtln_warp_factor): - htk_mode_(opts.htk_mode) { + BaseFloat vtln_warp_factor) { int32 num_bins = opts.num_bins; if (num_bins < 3) KALDI_ERR << "Must have at least 3 mel bins"; BaseFloat sample_freq = frame_opts.samp_freq; @@ -128,10 +127,6 @@ MelBanks::MelBanks(const MelBanksOptions &opts, bins_[bin].second.Resize(size); bins_[bin].second.CopyFromVec(this_bin.Range(first_index, size)); - // Replicate a bug in HTK, for testing purposes. - if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0) - bins_[bin].second(0) = 0.0; - } if (debug_) { for (size_t i = 0; i < bins_.size(); i++) { @@ -144,8 +139,7 @@ MelBanks::MelBanks(const MelBanksOptions &opts, MelBanks::MelBanks(const MelBanks &other): center_freqs_(other.center_freqs_), bins_(other.bins_), - debug_(other.debug_), - htk_mode_(other.htk_mode_) { } + debug_(other.debug_) { } BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN. BaseFloat vtln_high_cutoff, @@ -232,8 +226,6 @@ void MelBanks::Compute(const VectorBase &power_spectrum, int32 offset = bins_[i].first; const Vector &v(bins_[i].second); BaseFloat energy = VecVec(v, power_spectrum.Range(offset, v.Dim())); - // HTK-like flooring- for testing purposes (we prefer dither) - if (htk_mode_ && energy < 1.0) energy = 1.0; (*mel_energies_out)(i) = energy; // The following assert was added due to a problem with OpenBlas that @@ -250,91 +242,7 @@ void MelBanks::Compute(const VectorBase &power_spectrum, } } -void ComputeLifterCoeffs(BaseFloat Q, VectorBase *coeffs) { - // Compute liftering coefficients (scaling on cepstral coeffs) - // coeffs are numbered slightly differently from HTK: the zeroth - // index is C0, which is not affected. 
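// Illustrative sketch, not part of the patch: each mel energy computed by
// MelBanks::Compute() above is a dot product of one triangular filter with
// the corresponding slice of the power spectrum (linear, not log, energies).
#include <cstddef>
#include <vector>

inline float SketchMelBinEnergy(const std::vector<float> &bin_weights,   // triangular filter
                                const std::vector<float> &power_spectrum,
                                std::size_t offset) {  // first FFT bin covered by the filter
  float energy = 0.0f;
  for (std::size_t i = 0; i < bin_weights.size(); i++)
    energy += bin_weights[i] * power_spectrum[offset + i];
  return energy;
}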
- for (int32 i = 0; i < coeffs->Dim(); i++) - (*coeffs)(i) = 1.0 + 0.5 * Q * sin (M_PI * i / Q); -} - - -// Durbin's recursion - converts autocorrelation coefficients to the LPC -// pTmp - temporal place [n] -// pAC - autocorrelation coefficients [n + 1] -// pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i-1] * s[n-i]}}) -// F(z) = 1 / (1 - A(z)), 1 is not stored in the demoninator -BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp) { - BaseFloat ki; // reflection coefficient - int i; - int j; - - BaseFloat E = pAC[0]; - - for (i = 0; i < n; i++) { - // next reflection coefficient - ki = pAC[i + 1]; - for (j = 0; j < i; j++) - ki += pLP[j] * pAC[i - j]; - ki = ki / E; - - // new error - BaseFloat c = 1 - ki * ki; - if (c < 1.0e-5) // remove NaNs for constan signal - c = 1.0e-5; - E *= c; - - // new LP coefficients - pTmp[i] = -ki; - for (j = 0; j < i; j++) - pTmp[j] = pLP[j] - ki * pLP[i - j - 1]; - - for (j = 0; j <= i; j++) - pLP[j] = pTmp[j]; - } - return E; -} - - -void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst) { - for (int32 i = 0; i < n; i++) { - double sum = 0.0; - int j; - for (j = 0; j < i; j++) { - sum += static_cast(i - j) * pLPC[j] * pCepst[i - j - 1]; - } - pCepst[i] = -pLPC[i] - sum / static_cast(i + 1); - } -} - -void GetEqualLoudnessVector(const MelBanks &mel_banks, - Vector *ans) { - int32 n = mel_banks.NumBins(); - // Central frequency of each mel bin. - const Vector &f0 = mel_banks.GetCenterFreqs(); - ans->Resize(n); - for (int32 i = 0; i < n; i++) { - BaseFloat fsq = f0(i) * f0(i); - BaseFloat fsub = fsq / (fsq + 1.6e5); - (*ans)(i) = fsub * fsub * ((fsq + 1.44e6) / (fsq + 9.61e6)); - } -} - - -// Compute LP coefficients from autocorrelation coefficients. -BaseFloat ComputeLpc(const VectorBase &autocorr_in, - Vector *lpc_out) { - int32 n = autocorr_in.Dim() - 1; - KALDI_ASSERT(lpc_out->Dim() == n); - Vector tmp(n); - BaseFloat ans = Durbin(n, autocorr_in.Data(), - lpc_out->Data(), - tmp.Data()); - if (ans <= 0.0) - KALDI_WARN << "Zero energy in LPC computation"; - return -Log(1.0 / ans); // forms the C0 value -} } // namespace kaldi diff --git a/src/feat/mel-computations.h b/src/feat/mel-computations.h index 0c1d41ca45c..6822debc242 100644 --- a/src/feat/mel-computations.h +++ b/src/feat/mel-computations.h @@ -1,7 +1,7 @@ // feat/mel-computations.h // Copyright 2009-2011 Phonexia s.r.o.; Microsoft Corporation -// 2016 Johns Hopkins University (author: Daniel Povey) +// 2016-2019 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -44,18 +44,14 @@ struct MelBanksOptions { int32 num_bins; // e.g. 25; number of triangular bins BaseFloat low_freq; // e.g. 20; lower frequency cutoff BaseFloat high_freq; // an upper frequency cutoff; 0 -> no cutoff, negative - // ->added to the Nyquist frequency to get the cutoff. + // ->added to the Nyquist frequency to get the cutoff. BaseFloat vtln_low; // vtln lower cutoff of warping function. BaseFloat vtln_high; // vtln upper cutoff of warping function: if negative, added // to the Nyquist frequency to get the cutoff. bool debug_mel; - // htk_mode is a "hidden" config, it does not show up on command line. - // Enables more exact compatibility with HTK, for testing purposes. Affects - // mel-energy flooring and reproduces a bug in HTK. 
- bool htk_mode; explicit MelBanksOptions(int num_bins = 25) : num_bins(num_bins), low_freq(20), high_freq(0), vtln_low(100), - vtln_high(-500), debug_mel(false), htk_mode(false) {} + vtln_high(-500), debug_mel(false) { } void Register(OptionsItf *opts) { opts->Register("num-mel-bins", &num_bins, @@ -87,10 +83,9 @@ class MelBanks { } static BaseFloat VtlnWarpFreq(BaseFloat vtln_low_cutoff, - BaseFloat vtln_high_cutoff, // discontinuities in warp func + BaseFloat vtln_high_cutoff, BaseFloat low_freq, - BaseFloat high_freq, // upper+lower frequency cutoffs in - // the mel computation + BaseFloat high_freq, BaseFloat vtln_warp_factor, BaseFloat freq); @@ -106,7 +101,7 @@ class MelBanks { const FrameExtractionOptions &frame_opts, BaseFloat vtln_warp_factor); - /// Compute Mel energies (note: not log enerties). + /// Compute Mel energies (note: not log energies). /// At input, "fft_energies" contains the FFT energies (not log). void Compute(const VectorBase &fft_energies, VectorBase *mel_energies_out) const; @@ -135,36 +130,9 @@ class MelBanks { std::vector > > bins_; bool debug_; - bool htk_mode_; }; -// Compute liftering coefficients (scaling on cepstral coeffs) -// coeffs are numbered slightly differently from HTK: the zeroth -// index is C0, which is not affected. -void ComputeLifterCoeffs(BaseFloat Q, VectorBase *coeffs); - - -// Durbin's recursion - converts autocorrelation coefficients to the LPC -// pTmp - temporal place [n] -// pAC - autocorrelation coefficients [n + 1] -// pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i-1] * s[n-i]}}) -// F(z) = 1 / (1 - A(z)), 1 is not stored in the denominator -// Returns log energy of residual (I think) -BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp); - -// Compute LP coefficients from autocorrelation coefficients. -// Returns log energy of residual (I think) -BaseFloat ComputeLpc(const VectorBase &autocorr_in, - Vector *lpc_out); - -void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst); - - - -void GetEqualLoudnessVector(const MelBanks &mel_banks, - Vector *ans); - /// @} End of "addtogroup feat" } // namespace kaldi diff --git a/src/feat/online-feature-test.cc b/src/feat/online-feature-test.cc index 7ba6c7c32be..3e7834d6423 100644 --- a/src/feat/online-feature-test.cc +++ b/src/feat/online-feature-test.cc @@ -152,17 +152,11 @@ void TestOnlineMfcc() { // the parametrization object MfccOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; + op.frame_opts.samp_freq = wave.SampFreq(); op.mel_opts.low_freq = 0.0; - op.htk_compat = false; op.use_energy = false; // C0 not energy. - if (RandInt(0, 1) == 0) - op.frame_opts.snip_edges = false; Mfcc mfcc(op); // compute mfcc offline @@ -195,55 +189,6 @@ void TestOnlineMfcc() { } } -void TestOnlinePlp() { - std::ifstream is("../feat/test_data/test.wav", std::ios_base::binary); - WaveData wave; - wave.Read(is); - KALDI_ASSERT(wave.Data().NumRows() == 1); - SubVector waveform(wave.Data(), 0); - - // the parametrization object - PlpOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; - op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; - op.frame_opts.samp_freq = wave.SampFreq(); - op.mel_opts.low_freq = 0.0; - op.htk_compat = false; - op.use_energy = false; // C0 not energy. 
- Plp plp(op); - - // compute plp offline - Matrix plp_feats; - plp.Compute(waveform, 1.0, &plp_feats); // vtln not supported - - // compare - // The test waveform is about 1.44s long, so - // we try to break it into from 5 pieces to 9(not essential to do so) - for (int32 num_piece = 5; num_piece < 10; num_piece++) { - OnlinePlp online_plp(op); - std::vector piece_length(num_piece); - bool ret = RandomSplit(waveform.Dim(), &piece_length, num_piece); - KALDI_ASSERT(ret); - - int32 offset_start = 0; - for (int32 i = 0; i < num_piece; i++) { - Vector wave_piece( - waveform.Range(offset_start, piece_length[i])); - online_plp.AcceptWaveform(wave.SampFreq(), wave_piece); - offset_start += piece_length[i]; - } - online_plp.InputFinished(); - - Matrix online_plp_feats; - GetOutput(&online_plp, &online_plp_feats); - - AssertEqual(plp_feats, online_plp_feats); - } -} - void TestOnlineTransform() { std::ifstream is("../feat/test_data/test.wav", std::ios_base::binary); WaveData wave; @@ -253,14 +198,10 @@ void TestOnlineTransform() { // build online feature interface, take OnlineMfcc as an example MfccOptions op; - op.frame_opts.dither = 0.0; - op.frame_opts.preemph_coeff = 0.0; op.frame_opts.window_type = "hamming"; - op.frame_opts.remove_dc_offset = false; - op.frame_opts.round_to_power_of_two = true; + op.frame_opts.samp_freq = wave.SampFreq(); op.mel_opts.low_freq = 0.0; - op.htk_compat = false; op.use_energy = false; // C0 not energy. OnlineMfcc online_mfcc(op); @@ -296,14 +237,9 @@ void TestOnlineAppendFeature() { // the parametrization object for 1st stream mfcc feature MfccOptions mfcc_op; - mfcc_op.frame_opts.dither = 0.0; - mfcc_op.frame_opts.preemph_coeff = 0.0; mfcc_op.frame_opts.window_type = "hamming"; - mfcc_op.frame_opts.remove_dc_offset = false; - mfcc_op.frame_opts.round_to_power_of_two = true; mfcc_op.frame_opts.samp_freq = wave.SampFreq(); mfcc_op.mel_opts.low_freq = 0.0; - mfcc_op.htk_compat = false; mfcc_op.use_energy = false; // C0 not energy. Mfcc mfcc(mfcc_op); @@ -311,30 +247,13 @@ void TestOnlineAppendFeature() { Matrix mfcc_feats; mfcc.Compute(waveform, 1.0, &mfcc_feats); // vtln not supported - // the parametrization object for 2nd stream plp feature - PlpOptions plp_op; - plp_op.frame_opts.dither = 0.0; - plp_op.frame_opts.preemph_coeff = 0.0; - plp_op.frame_opts.window_type = "hamming"; - plp_op.frame_opts.remove_dc_offset = false; - plp_op.frame_opts.round_to_power_of_two = true; - plp_op.frame_opts.samp_freq = wave.SampFreq(); - plp_op.mel_opts.low_freq = 0.0; - plp_op.htk_compat = false; - plp_op.use_energy = false; // C0 not energy. 
- Plp plp(plp_op); - - // compute plp offline - Matrix plp_feats; - plp.Compute(waveform, 1.0, &plp_feats); // vtln not supported - // compare // The test waveform is about 1.44s long, so // we try to break it into from 5 pieces to 9(not essential to do so) for (int32 num_piece = 5; num_piece < 10; num_piece++) { - OnlineMfcc online_mfcc(mfcc_op); - OnlinePlp online_plp(plp_op); - OnlineAppendFeature online_mfcc_plp(&online_mfcc, &online_plp); + OnlineMfcc online_mfcc(mfcc_op), + online_mfcc2(mfcc_op); + OnlineAppendFeature online_mfcc_doubled(&online_mfcc, &online_mfcc2); std::vector piece_length(num_piece); bool ret = RandomSplit(waveform.Dim(), &piece_length, num_piece); @@ -344,32 +263,27 @@ void TestOnlineAppendFeature() { Vector wave_piece( waveform.Range(offset_start, piece_length[i])); online_mfcc.AcceptWaveform(wave.SampFreq(), wave_piece); - online_plp.AcceptWaveform(wave.SampFreq(), wave_piece); + online_mfcc2.AcceptWaveform(wave.SampFreq(), wave_piece); offset_start += piece_length[i]; } online_mfcc.InputFinished(); - online_plp.InputFinished(); + online_mfcc2.InputFinished(); - Matrix online_mfcc_plp_feats; - GetOutput(&online_mfcc_plp, &online_mfcc_plp_feats); + Matrix online_mfcc_doubled_feats; + GetOutput(&online_mfcc_doubled, &online_mfcc_doubled_feats); - // compare mfcc_feats & plp_features with online_mfcc_plp_feats - KALDI_ASSERT(mfcc_feats.NumRows() == online_mfcc_plp_feats.NumRows() - && plp_feats.NumRows() == online_mfcc_plp_feats.NumRows() - && mfcc_feats.NumCols() + plp_feats.NumCols() - == online_mfcc_plp_feats.NumCols()); - for (MatrixIndexT i = 0; i < online_mfcc_plp_feats.NumRows(); i++) { + // compare mfcc_feats & plp_features with online_mfcc_doubled_feats + KALDI_ASSERT(mfcc_feats.NumRows() == online_mfcc_doubled_feats.NumRows() && + online_mfcc_doubled_feats.NumCols() == 2 * mfcc_feats.NumCols()); + for (MatrixIndexT i = 0; i < online_mfcc_doubled_feats.NumRows(); i++) { for (MatrixIndexT j = 0; j < mfcc_feats.NumCols(); j++) { - KALDI_ASSERT(std::abs(mfcc_feats(i, j) - online_mfcc_plp_feats(i, j)) - < 0.0001*std::max(1.0, static_cast(std::abs(mfcc_feats(i, j)) - + std::abs(online_mfcc_plp_feats(i, j))))); - } - for (MatrixIndexT k = 0; k < plp_feats.NumCols(); k++) { - KALDI_ASSERT( - std::abs(plp_feats(i, k) - - online_mfcc_plp_feats(i, mfcc_feats.NumCols() + k)) - < 0.0001*std::max(1.0, static_cast(std::abs(plp_feats(i, k)) - +std::abs(online_mfcc_plp_feats(i, mfcc_feats.NumCols() + k))))); + MatrixIndexT jj = j; + for (int count = 0; count < 2; count++) { + KALDI_ASSERT(std::abs(mfcc_feats(i, j) - online_mfcc_doubled_feats(i, jj)) + < 0.0001*std::max(1.0, static_cast(std::abs(mfcc_feats(i, j)) + + std::abs(online_mfcc_doubled_feats(i, jj))))); + jj += mfcc_feats.NumCols(); + } } } } @@ -423,7 +337,6 @@ int main() { TestOnlineDeltaFeature(); TestOnlineSpliceFrames(); TestOnlineMfcc(); - TestOnlinePlp(); TestOnlineTransform(); TestOnlineAppendFeature(); TestRecyclingVector(); diff --git a/src/feat/online-feature.cc b/src/feat/online-feature.cc index 6f5ce6ee95b..b2c4799dacf 100644 --- a/src/feat/online-feature.cc +++ b/src/feat/online-feature.cc @@ -69,14 +69,12 @@ void OnlineGenericBaseFeature::GetFrame(int32 frame, template OnlineGenericBaseFeature::OnlineGenericBaseFeature( const typename C::Options &opts): - computer_(opts), window_function_(computer_.GetFrameOptions()), + computer_(opts), features_(opts.frame_opts.max_feature_vectors), - input_finished_(false), waveform_offset_(0) { - // RE the following assert: search for ONLINE_IVECTOR_LIMIT in - 
// online-ivector-feature.cc. - // Casting to uint32, an unsigned type, means that -1 would be treated - // as `very large`. - KALDI_ASSERT(static_cast(opts.frame_opts.max_feature_vectors) > 200); + input_finished_(false), + waveform_offset_(0) { + InitFeatureWindowFunction(computer_.GetFrameOptions(), + &window_function_); } @@ -168,17 +166,14 @@ void OnlineGenericBaseFeature::ComputeFeatures() { KALDI_ASSERT(num_frames_new >= num_frames_old); Vector window; - bool need_raw_log_energy = computer_.NeedRawLogEnergy(); for (int32 frame = num_frames_old; frame < num_frames_new; frame++) { - BaseFloat raw_log_energy = 0.0; ExtractWindow(waveform_offset_, waveform_remainder_, frame, - frame_opts, window_function_, &window, - need_raw_log_energy ? &raw_log_energy : NULL); + frame_opts, window_function_, &window); Vector *this_feature = new Vector(computer_.Dim(), kUndefined); // note: this online feature-extraction code does not support VTLN. BaseFloat vtln_warp = 1.0; - computer_.Compute(raw_log_energy, vtln_warp, &window, this_feature); + computer_.Compute(vtln_warp, &window, this_feature); features_.PushBack(this_feature); } // OK, we will now discard any portion of the signal that will not be @@ -205,7 +200,6 @@ void OnlineGenericBaseFeature::ComputeFeatures() { // instantiate the templates defined here for MFCC, PLP and filterbank classes. template class OnlineGenericBaseFeature; -template class OnlineGenericBaseFeature; template class OnlineGenericBaseFeature; OnlineCmvnState::OnlineCmvnState(const OnlineCmvnState &other): diff --git a/src/feat/online-feature.h b/src/feat/online-feature.h index f2ebe45bf3e..0c34c2de5dc 100644 --- a/src/feat/online-feature.h +++ b/src/feat/online-feature.h @@ -32,7 +32,6 @@ #include "base/kaldi-error.h" #include "feat/feature-functions.h" #include "feat/feature-mfcc.h" -#include "feat/feature-plp.h" #include "feat/feature-fbank.h" #include "itf/online-feature-itf.h" @@ -72,7 +71,7 @@ class RecyclingVector { /// This is a templated class for online feature extraction; -/// it's templated on a class like MfccComputer or PlpComputer +/// it's templated on a class like MfccComputer /// that does the basic feature extraction. template class OnlineGenericBaseFeature: public OnlineBaseFeature { @@ -126,15 +125,15 @@ class OnlineGenericBaseFeature: public OnlineBaseFeature { void MaybeCreateResampler(BaseFloat sampling_rate); - C computer_; // class that does the MFCC or PLP or filterbank computation + C computer_; // class that does the MFCC or filterbank computation // resampler in cases when the input sampling frequency is not equal to // the expected sampling rate std::unique_ptr resampler_; - FeatureWindowFunction window_function_; + Vector window_function_; - // features_ is the Mfcc or Plp or Fbank features that we have already computed. + // features_ is the Mfcc or Fbank features that we have already computed. RecyclingVector features_; @@ -156,7 +155,6 @@ class OnlineGenericBaseFeature: public OnlineBaseFeature { }; typedef OnlineGenericBaseFeature OnlineMfcc; -typedef OnlineGenericBaseFeature OnlinePlp; typedef OnlineGenericBaseFeature OnlineFbank; @@ -597,7 +595,7 @@ class OnlineCacheFeature: public OnlineFeatureInterface { /// This online-feature class implements combination of two feature -/// streams (such as pitch, plp) into one stream. +/// streams (such as pitch) into one stream. 
class OnlineAppendFeature: public OnlineFeatureInterface { public: virtual int32 Dim() const { return src1_->Dim() + src2_->Dim(); } diff --git a/src/feat/pitch-functions-test.cc b/src/feat/pitch-functions-test.cc index 0e481c18674..e3953acb884 100644 --- a/src/feat/pitch-functions-test.cc +++ b/src/feat/pitch-functions-test.cc @@ -25,7 +25,6 @@ #include #include "base/kaldi-math.h" -#include "feat/feature-plp.h" #include "feat/pitch-functions.h" #include "feat/wave-reader.h" #include "sys/stat.h" diff --git a/src/feat/test_data/README b/src/feat/test_data/README index 8deadd273a4..e44395c6bad 100644 --- a/src/feat/test_data/README +++ b/src/feat/test_data/README @@ -7,24 +7,4 @@ #1) convert 16kHz,lin16 wav to KALDI ASCII vector format cat prepare_wav_in_ascii.m | matlab -#2) perform reference feature extraction by HTK -# we used HCopy from HTK V3.4 -HCopy -C hcopy1.conf test.wav test.wav.fea_htk.1 - -HCopy -C hcopy2.conf test.wav test.wav.fea_htk.2 - -HCopy -C hcopy3.conf test.wav test.wav.fea_htk.3 - -HCopy -C hcopy4.conf test.wav test.wav.fea_htk.4 - -HCopy -C hcopy5.conf test.wav test.wav.fea_htk.5 - -HCopy -C hcopy6.conf test.wav test.wav.fea_htk.6 - -HCopy -C plp1.conf test.wav test.wav.plp_htk.1 - -HCopy -C fbank1.conf test.wav test.wav.fbank_htk.1 - -HCopy -C fbank2.conf test.wav test.wav.fbank_htk.2 - -HCopy -C fbank3.conf test.wav test.wav.fbank_htk.3 +#2) perform reference feature extraction by HTK - HTK support is deprecated. diff --git a/src/feat/test_data/fbank1.conf b/src/feat/test_data/fbank1.conf deleted file mode 100644 index b751b61d6d1..00000000000 --- a/src/feat/test_data/fbank1.conf +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/sh - -SOURCEKIND = WAVEFORM -SOURCEFORMAT = WAV -SOURCERATE = 625 -BYTEORDER = VAX -TARGETFORMAT = HTK -TARGETKIND = FBANK - -LOFREQ = 0 -HIFREQ = 8000 -#WARPLCUTOFF = 100 -#WARPUCUTOFF = 7500 -#WARPFREQ = 1.0 -NUMCHANS = 23 # number of critical bands -USEPOWER = T # using power spectrum -USEHAMMING = T # use hamming window on speech frame -RAWENERGY = T -ENORMALISE = F - - -PREEMCOEF = 0.0 -TARGETRATE = 100000 # 10 ms frame rate -WINDOWSIZE = 250000 # 25 ms window -SAVEWITHCRC = F - diff --git a/src/feat/test_data/fbank2.conf b/src/feat/test_data/fbank2.conf deleted file mode 100644 index 604819a88c4..00000000000 --- a/src/feat/test_data/fbank2.conf +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/sh - -SOURCEKIND = WAVEFORM -SOURCEFORMAT = WAV -SOURCERATE = 625 -BYTEORDER = VAX -TARGETFORMAT = HTK -TARGETKIND = FBANK - -LOFREQ = 25 -HIFREQ = 8000 -#WARPLCUTOFF = 100 -#WARPUCUTOFF = 7500 -#WARPFREQ = 1.0 -NUMCHANS = 23 # number of critical bands -USEPOWER = T # using power spectrum -USEHAMMING = T # use hamming window on speech frame -RAWENERGY = T -ENORMALISE = F - - -PREEMCOEF = 0.0 -TARGETRATE = 100000 # 10 ms frame rate -WINDOWSIZE = 250000 # 25 ms window -SAVEWITHCRC = F - diff --git a/src/feat/test_data/fbank3.conf b/src/feat/test_data/fbank3.conf deleted file mode 100644 index f86fec4b248..00000000000 --- a/src/feat/test_data/fbank3.conf +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/sh - -SOURCEKIND = WAVEFORM -SOURCEFORMAT = WAV -SOURCERATE = 625 -BYTEORDER = VAX -TARGETFORMAT = HTK -TARGETKIND = FBANK - -LOFREQ = 25 -HIFREQ = 8000 -WARPLCUTOFF = 100 -WARPUCUTOFF = 7500 -WARPFREQ = 0.9 -NUMCHANS = 23 # number of critical bands -USEPOWER = T # using power spectrum -USEHAMMING = T # use hamming window on speech frame -RAWENERGY = T -ENORMALISE = F - - -PREEMCOEF = 0.0 -TARGETRATE = 100000 # 10 ms frame rate -WINDOWSIZE = 250000 # 25 ms window -SAVEWITHCRC = F - 
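// Illustrative sketch, not part of the patch: rough usage of the online
// interface exercised by the modified test above -- feed waveform chunks into
// two OnlineMfcc streams and read back the appended features.  Option values
// and the two-piece split are arbitrary examples.
#include "feat/online-feature.h"

void SketchAppendTwoMfccStreams(const kaldi::VectorBase<kaldi::BaseFloat> &wave,
                                kaldi::BaseFloat samp_freq) {
  using namespace kaldi;
  MfccOptions opts;
  opts.frame_opts.samp_freq = samp_freq;
  OnlineMfcc mfcc_a(opts), mfcc_b(opts);
  OnlineAppendFeature both(&mfcc_a, &mfcc_b);  // Dim() == 2 * MFCC dim

  // Feed the waveform in two pieces, then flush.
  int32 half = wave.Dim() / 2;
  SubVector<BaseFloat> piece1(wave, 0, half),
      piece2(wave, half, wave.Dim() - half);
  mfcc_a.AcceptWaveform(samp_freq, piece1);
  mfcc_b.AcceptWaveform(samp_freq, piece1);
  mfcc_a.AcceptWaveform(samp_freq, piece2);
  mfcc_b.AcceptWaveform(samp_freq, piece2);
  mfcc_a.InputFinished();
  mfcc_b.InputFinished();

  Vector<BaseFloat> frame(both.Dim());
  for (int32 t = 0; t < both.NumFramesReady(); t++)
    both.GetFrame(t, &frame);  // the two halves are identical here
}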
diff --git a/src/feat/test_data/fbank4.conf b/src/feat/test_data/fbank4.conf deleted file mode 100644 index a19679f8375..00000000000 --- a/src/feat/test_data/fbank4.conf +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/sh - -SOURCEKIND = WAVEFORM -SOURCEFORMAT = WAV -SOURCERATE = 625 -BYTEORDER = VAX -TARGETFORMAT = HTK -TARGETKIND = FBANK - -LOFREQ = 25 -HIFREQ = 8000 -WARPLCUTOFF = 100 -WARPUCUTOFF = 7500 -WARPFREQ = 1.1 -NUMCHANS = 23 # number of critical bands -USEPOWER = T # using power spectrum -USEHAMMING = T # use hamming window on speech frame -RAWENERGY = T -ENORMALISE = F - - -PREEMCOEF = 0.0 -TARGETRATE = 100000 # 10 ms frame rate -WINDOWSIZE = 250000 # 25 ms window -SAVEWITHCRC = F - diff --git a/src/feat/test_data/hcopy1.conf b/src/feat/test_data/hcopy1.conf deleted file mode 100644 index 25230348076..00000000000 --- a/src/feat/test_data/hcopy1.conf +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/sh - -SOURCEKIND = WAVEFORM -SOURCEFORMAT = WAV -SOURCERATE = 625 -BYTEORDER = VAX -TARGETFORMAT = HTK -TARGETKIND = MFCC_D_A_0 - -LOFREQ = 0 -HIFREQ = 8000 -#WARPLCUTOFF = 100 -#WARPUCUTOFF = 7500 -#WARPFREQ = 1.0 -NUMCHANS = 23 # number of critical bands -USEPOWER = T # using power spectrum -USEHAMMING = T # use hamming window on speech frame -RAWENERGY = T -ENORMALISE = F - - -PREEMCOEF = 0.0 -TARGETRATE = 100000 # 10 ms frame rate -WINDOWSIZE = 250000 # 25 ms window -SAVEWITHCRC = F - -CEPLIFTER = 22 -NUMCEPS = 12 diff --git a/src/feat/test_data/hcopy2.conf b/src/feat/test_data/hcopy2.conf deleted file mode 100644 index 36c7d97d459..00000000000 --- a/src/feat/test_data/hcopy2.conf +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh - -SOURCEKIND = WAVEFORM -SOURCEFORMAT = WAV -SOURCERATE = 625 -BYTEORDER = VAX -TARGETFORMAT = HTK -TARGETKIND = MFCC_D_A_E - -LOFREQ = 0 -HIFREQ = 8000 -NUMCHANS = 23 # number of critical bands -USEPOWER = T # using power spectrum -USEHAMMING = T # use hamming window on speech frame -RAWENERGY = T -ENORMALISE = F - -PREEMCOEF = 0 # no preemphase -TARGETRATE = 100000 # 10 ms frame rate -WINDOWSIZE = 250000 # 25 ms window -SAVEWITHCRC = F - -CEPLIFTER = 22 -NUMCEPS = 12 diff --git a/src/feat/test_data/hcopy3.conf b/src/feat/test_data/hcopy3.conf deleted file mode 100644 index 6ed093af685..00000000000 --- a/src/feat/test_data/hcopy3.conf +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh - -SOURCEKIND = WAVEFORM -SOURCEFORMAT = WAV -SOURCERATE = 625 -BYTEORDER = VAX -TARGETFORMAT = HTK -TARGETKIND = MFCC_D_A_E - -LOFREQ = 20 -HIFREQ = 8000 -NUMCHANS = 23 # number of critical bands -USEPOWER = T # using power spectrum -USEHAMMING = T # use hamming window on speech frame -RAWENERGY = T -ENORMALISE = F - -PREEMCOEF = 0 # no preemphase -TARGETRATE = 100000 # 10 ms frame rate -WINDOWSIZE = 250000 # 25 ms window -SAVEWITHCRC = F - -CEPLIFTER = 22 -NUMCEPS = 12 diff --git a/src/feat/test_data/hcopy4.conf b/src/feat/test_data/hcopy4.conf deleted file mode 100644 index e51a361cccd..00000000000 --- a/src/feat/test_data/hcopy4.conf +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh - -SOURCEKIND = WAVEFORM -SOURCEFORMAT = WAV -SOURCERATE = 625 -BYTEORDER = VAX -TARGETFORMAT = HTK -TARGETKIND = MFCC_D_A_E - -LOFREQ = 0 -HIFREQ = 8000 -NUMCHANS = 23 # number of critical bands -USEPOWER = T # using power spectrum -USEHAMMING = T # use hamming window on speech frame -RAWENERGY = T -ENORMALISE = F - -PREEMCOEF = 0.97 -TARGETRATE = 100000 # 10 ms frame rate -WINDOWSIZE = 250000 # 25 ms window -SAVEWITHCRC = F - -CEPLIFTER = 22 -NUMCEPS = 12 diff --git a/src/feat/test_data/hcopy5.conf b/src/feat/test_data/hcopy5.conf 
deleted file mode 100644 index d280548b91f..00000000000 --- a/src/feat/test_data/hcopy5.conf +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh - -SOURCEKIND = WAVEFORM -SOURCEFORMAT = WAV -SOURCERATE = 625 -BYTEORDER = VAX -TARGETFORMAT = HTK -TARGETKIND = MFCC_D_A_E - -LOFREQ = 0 -HIFREQ = 8000 -WARPLCUTOFF = 100 -WARPUCUTOFF = 7500 -WARPFREQ = 1.1 -NUMCHANS = 23 # number of critical bands -USEPOWER = T # using power spectrum -USEHAMMING = T # use hamming window on speech frame -RAWENERGY = T -ENORMALISE = F - -PREEMCOEF = 0.97 -TARGETRATE = 100000 # 10 ms frame rate -WINDOWSIZE = 250000 # 25 ms window -SAVEWITHCRC = F - -CEPLIFTER = 22 -NUMCEPS = 12 diff --git a/src/feat/test_data/hcopy6.conf b/src/feat/test_data/hcopy6.conf deleted file mode 100644 index 5e305c9d445..00000000000 --- a/src/feat/test_data/hcopy6.conf +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh - -SOURCEKIND = WAVEFORM -SOURCEFORMAT = WAV -SOURCERATE = 625 -BYTEORDER = VAX -TARGETFORMAT = HTK -TARGETKIND = MFCC_D_A_0 - -LOFREQ = 125 -HIFREQ = 7800 -#WARPLCUTOFF = 100 -#WARPUCUTOFF = 7500 -#WARPFREQ = 1.0 -NUMCHANS = 24 # number of critical bands -USEPOWER = T # using power spectrum -USEHAMMING = T # use hamming window on speech frame -RAWENERGY = T -ENORMALISE = F - -PREEMCOEF = 0.97 -TARGETRATE = 100000 # 10 ms frame rate -WINDOWSIZE = 250000 # 25 ms window -SAVEWITHCRC = F - -CEPLIFTER = 22 -NUMCEPS = 12 diff --git a/src/feat/test_data/plp1.conf b/src/feat/test_data/plp1.conf deleted file mode 100644 index 3465bd20d18..00000000000 --- a/src/feat/test_data/plp1.conf +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/sh - -SOURCEKIND = WAVEFORM -SOURCEFORMAT = WAV -SOURCERATE = 625 -BYTEORDER = VAX -TARGETFORMAT = HTK -TARGETKIND = PLP_D_A_0 - -LOFREQ = 0 -HIFREQ = 8000 -NUMCHANS = 23 # number of critical bands -USEPOWER = T # using power spectrum -USEHAMMING = T # use hamming window on speech frame -COMPRESSFACT = 0.33 - -PREEMCOEF = 0 -TARGETRATE = 100000 # 10 ms frame rate -WINDOWSIZE = 250000 # 25 ms window -SAVEWITHCRC = F - -CEPLIFTER = 22 -NUMCEPS = 12 diff --git a/src/feat/test_data/test.wav.fbank_htk.1 b/src/feat/test_data/test.wav.fbank_htk.1 deleted file mode 100644 index cd6083c130d..00000000000 Binary files a/src/feat/test_data/test.wav.fbank_htk.1 and /dev/null differ diff --git a/src/feat/test_data/test.wav.fbank_htk.2 b/src/feat/test_data/test.wav.fbank_htk.2 deleted file mode 100644 index 4e95c15a308..00000000000 Binary files a/src/feat/test_data/test.wav.fbank_htk.2 and /dev/null differ diff --git a/src/feat/test_data/test.wav.fbank_htk.3 b/src/feat/test_data/test.wav.fbank_htk.3 deleted file mode 100644 index fb3ab2258eb..00000000000 Binary files a/src/feat/test_data/test.wav.fbank_htk.3 and /dev/null differ diff --git a/src/feat/test_data/test.wav.fbank_htk.4 b/src/feat/test_data/test.wav.fbank_htk.4 deleted file mode 100644 index ec9fae638c0..00000000000 Binary files a/src/feat/test_data/test.wav.fbank_htk.4 and /dev/null differ diff --git a/src/feat/test_data/test.wav.fea_htk.1 b/src/feat/test_data/test.wav.fea_htk.1 deleted file mode 100644 index 0cc28939ef2..00000000000 Binary files a/src/feat/test_data/test.wav.fea_htk.1 and /dev/null differ diff --git a/src/feat/test_data/test.wav.fea_htk.2 b/src/feat/test_data/test.wav.fea_htk.2 deleted file mode 100644 index 5d68a63d43b..00000000000 Binary files a/src/feat/test_data/test.wav.fea_htk.2 and /dev/null differ diff --git a/src/feat/test_data/test.wav.fea_htk.3 b/src/feat/test_data/test.wav.fea_htk.3 deleted file mode 100644 index c1c577d5749..00000000000 Binary files 
a/src/feat/test_data/test.wav.fea_htk.3 and /dev/null differ diff --git a/src/feat/test_data/test.wav.fea_htk.4 b/src/feat/test_data/test.wav.fea_htk.4 deleted file mode 100644 index 0b8667e1fa3..00000000000 Binary files a/src/feat/test_data/test.wav.fea_htk.4 and /dev/null differ diff --git a/src/feat/test_data/test.wav.fea_htk.5 b/src/feat/test_data/test.wav.fea_htk.5 deleted file mode 100644 index d5164ad76a4..00000000000 Binary files a/src/feat/test_data/test.wav.fea_htk.5 and /dev/null differ diff --git a/src/feat/test_data/test.wav.fea_htk.6 b/src/feat/test_data/test.wav.fea_htk.6 deleted file mode 100644 index c7d52ce013c..00000000000 Binary files a/src/feat/test_data/test.wav.fea_htk.6 and /dev/null differ diff --git a/src/feat/test_data/test.wav.plp_htk.1 b/src/feat/test_data/test.wav.plp_htk.1 deleted file mode 100644 index 3485889d10d..00000000000 Binary files a/src/feat/test_data/test.wav.plp_htk.1 and /dev/null differ diff --git a/src/feat/wave-reader-test.cc b/src/feat/wave-reader-test.cc index f9a71e8af34..ce8299446be 100644 --- a/src/feat/wave-reader-test.cc +++ b/src/feat/wave-reader-test.cc @@ -72,6 +72,10 @@ static void UnitTestStereo8K() { std::istringstream ies(expect_mat, std::ios::in); Matrix expected; expected.Read(ies, false /* text */); + // WaveData scales data to the range [-1, 1], so do the same. Don't + // put the scaled values in the string expect_mat, since + // representing floating point as text losslessly is tricky. + expected.Scale(BaseFloat(1.0 / 32768.0)); AssertEqual(wave.SampFreq(), hz, 0); AssertEqual(wave.Duration(), 3.0 /* samples */ / hz /* Hz */, 1E-6); @@ -118,6 +122,7 @@ static void UnitTestMono22K() { std::istringstream ies(expect_mat, std::ios::in); Matrix expected; expected.Read(ies, false /* text */); + expected.Scale(BaseFloat(1.0 / 32768.0)); AssertEqual(wave.SampFreq(), hz, 0); AssertEqual(wave.Duration(), 5.0 /* samples */ / hz /* Hz */, 1E-6); @@ -157,6 +162,7 @@ static void UnitTestEndless1() { std::istringstream ies(expect_mat, std::ios::in); Matrix expected; expected.Read(ies, false /* text */); + expected.Scale(BaseFloat(1.0 / 32768.0)); AssertEqual(wave.Data(), expected); } @@ -194,6 +200,7 @@ static void UnitTestEndless2() { std::istringstream ies(expect_mat, std::ios::in); Matrix expected; expected.Read(ies, false /* text */); + expected.Scale(BaseFloat(1.0 / 32768.0)); AssertEqual(wave.Data(), expected); } diff --git a/src/feat/wave-reader.cc b/src/feat/wave-reader.cc index f8259a3a82e..bd35b1cff43 100644 --- a/src/feat/wave-reader.cc +++ b/src/feat/wave-reader.cc @@ -308,7 +308,11 @@ void WaveData::Read(std::istream &is) { uint16 *data_ptr = reinterpret_cast(&buffer[0]); - // The matrix is arranged row per channel, column per sample. + // Scale the wave data to the range [-1, 1]. Prior to kaldi-10, + // it was in the range [-32768.0, 32768.0]. + const BaseFloat scale = 1.0 / 32768.0; + + // The row-indexes are channels; column-indexes are samples. data_.Resize(header.NumChannels(), buffer.size() / header.BlockAlign()); for (uint32 i = 0; i < data_.NumCols(); ++i) { @@ -316,7 +320,7 @@ void WaveData::Read(std::istream &is) { int16 k = *data_ptr++; if (header.ReverseBytes()) KALDI_SWAP2(k); - data_(j, i) = k; + data_(j, i) = k * scale; } } } @@ -358,9 +362,13 @@ void WaveData::Write(std::ostream &os) const { int32 stride = data_.Stride(); int num_clipped = 0; + + // This scaling factor is because we are writing 16-bit data.
+ const BaseFloat scale = 32768.0; + for (int32 i = 0; i < num_samp; i++) { for (int32 j = 0; j < num_chan; j++) { - int32 elem = static_cast(trunc(data_ptr[j * stride + i])); + int32 elem = static_cast(trunc(data_ptr[j * stride + i] * scale)); int16 elem_16 = static_cast(elem); if (elem < std::numeric_limits::min()) { elem_16 = std::numeric_limits::min(); diff --git a/src/feat/wave-reader.h b/src/feat/wave-reader.h index dae74139232..2c07bda2728 100644 --- a/src/feat/wave-reader.h +++ b/src/feat/wave-reader.h @@ -2,7 +2,7 @@ // Copyright 2009-2011 Karel Vesely; Microsoft Corporation // 2013 Florent Masson -// 2013 Johns Hopkins University (author: Daniel Povey) +// 2013-2019 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -57,10 +57,6 @@ namespace kaldi { -/// For historical reasons, we scale waveforms to the range -/// (2^15-1)*[-1, 1], not the usual default DSP range [-1, 1]. -const BaseFloat kWaveSampleMax = 32768.0; - /// This class reads and hold wave file header information. class WaveInfo { public: @@ -121,6 +117,8 @@ class WaveData { // This function returns the wave data-- it's in a matrix // because there may be multiple channels. In the normal case // there's just one channel so Data() will have one row. + // This data will be in the range [-1, 1]. This is a difference + // from pre-kaldi10. const Matrix &Data() const { return data_; } BaseFloat SampFreq() const { return samp_freq_; } diff --git a/src/featbin/Makefile b/src/featbin/Makefile index 861ba3f7a93..bb11b797e69 100644 --- a/src/featbin/Makefile +++ b/src/featbin/Makefile @@ -8,11 +8,10 @@ BINFILES = add-deltas add-deltas-sdc append-post-to-feats \ compose-transforms compute-and-process-kaldi-pitch-feats \ compute-cmvn-stats compute-cmvn-stats-two-channel \ compute-fbank-feats compute-kaldi-pitch-feats compute-mfcc-feats \ - compute-plp-feats compute-spectrogram-feats concat-feats copy-feats \ + concat-feats copy-feats \ copy-feats-to-htk copy-feats-to-sphinx extend-transform-dim \ extract-feature-segments extract-segments feat-to-dim \ - feat-to-len fmpe-acc-stats fmpe-apply-transform fmpe-est \ - fmpe-init fmpe-sum-accs get-full-lda-mat interpolate-pitch \ + feat-to-len get-full-lda-mat interpolate-pitch \ modify-cmvn-stats paste-feats post-to-feats \ process-kaldi-pitch-feats process-pitch-feats \ select-feats shift-feats splice-feats subsample-feats \ @@ -25,7 +24,7 @@ TESTFILES = ADDLIBS = ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/featbin/compute-plp-feats.cc b/src/featbin/compute-plp-feats.cc deleted file mode 100644 index 5c3b9843b4d..00000000000 --- a/src/featbin/compute-plp-feats.cc +++ /dev/null @@ -1,192 +0,0 @@ -// featbin/compute-plp-feats.cc - -// Copyright 2009-2012 Microsoft Corporation -// Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
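// Illustrative sketch, not part of the patch: the int16 <-> float conversions
// implied by the new scaling in WaveData::Read() and WaveData::Write() above;
// samples are held in [-1, 1] in memory and scaled back (with clipping) when
// writing 16-bit data.
#include <algorithm>
#include <cstdint>

inline float SketchInt16ToFloat(int16_t s) {
  return s * (1.0f / 32768.0f);                    // read: scale into [-1, 1]
}

inline int16_t SketchFloatToInt16(float x) {
  float y = x * 32768.0f;                          // write: scale back to the 16-bit range
  y = std::max(-32768.0f, std::min(32767.0f, y));  // clip instead of wrapping
  return static_cast<int16_t>(y);
}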
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "feat/feature-plp.h" -#include "feat/wave-reader.h" -#include "util/common-utils.h" - - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - const char *usage = - "Create PLP feature files.\n" - "Usage: compute-plp-feats [options...] " - "\n"; - - // Construct all the global objects. - ParseOptions po(usage); - PlpOptions plp_opts; - // Define defaults for global options. - bool subtract_mean = false; - BaseFloat vtln_warp = 1.0; - std::string vtln_map_rspecifier; - std::string utt2spk_rspecifier; - int32 channel = -1; - BaseFloat min_duration = 0.0; - std::string output_format = "kaldi"; - std::string utt2dur_wspecifier; - - // Register the options. - po.Register("output-format", &output_format, "Format of the output " - "files [kaldi, htk]"); - po.Register("subtract-mean", &subtract_mean, "Subtract mean of each " - "feature file [CMS]. "); - po.Register("vtln-warp", &vtln_warp, "Vtln warp factor (only applicable " - "if vtln-map not specified)"); - po.Register("vtln-map", &vtln_map_rspecifier, "Map from utterance or " - "speaker-id to vtln warp factor (rspecifier)"); - po.Register("utt2spk", &utt2spk_rspecifier, "Utterance to speaker-id " - "map (if doing VTLN and you have warps per speaker)"); - po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, " - "0 -> left, 1 -> right)"); - po.Register("min-duration", &min_duration, "Minimum duration of segments " - "to process (in seconds)."); - po.Register("write-utt2dur", &utt2dur_wspecifier, "Wspecifier to write " - "duration of each utterance in seconds, e.g. 'ark,t:utt2dur'."); - - plp_opts.Register(&po); - - po.Read(argc, argv); - - if (po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - - std::string wav_rspecifier = po.GetArg(1); - - std::string output_wspecifier = po.GetArg(2); - - Plp plp(plp_opts); - - if (utt2spk_rspecifier != "" && vtln_map_rspecifier != "") - KALDI_ERR << ("The --utt2spk option is only needed if " - "the --vtln-map option is used."); - RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier, - utt2spk_rspecifier); - - SequentialTableReader reader(wav_rspecifier); - BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter. 
- TableWriter htk_writer; - - if (output_format == "kaldi") { - if (!kaldi_writer.Open(output_wspecifier)) - KALDI_ERR << "Could not initialize output with wspecifier " - << output_wspecifier; - } else if (output_format == "htk") { - if (!htk_writer.Open(output_wspecifier)) - KALDI_ERR << "Could not initialize output with wspecifier " - << output_wspecifier; - } else { - KALDI_ERR << "Invalid output_format string " << output_format; - } - - DoubleWriter utt2dur_writer(utt2dur_wspecifier); - - int32 num_utts = 0, num_success = 0; - for (; !reader.Done(); reader.Next()) { - num_utts++; - std::string utt = reader.Key(); - const WaveData &wave_data = reader.Value(); - if (wave_data.Duration() < min_duration) { - KALDI_WARN << "File: " << utt << " is too short (" - << wave_data.Duration() << " sec): producing no output."; - continue; - } - int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; - { // This block works out the channel (0=left, 1=right...). - KALDI_ASSERT(num_chan > 0); // This should have been caught in - // reading code if no channels. - if (channel == -1) { - this_chan = 0; - if (num_chan != 1) - KALDI_WARN << "Channel not specified but you have data with " - << num_chan << " channels; defaulting to zero"; - } else { - if (this_chan >= num_chan) { - KALDI_WARN << "File with id " << utt << " has " - << num_chan << " channels but you specified channel " - << channel << ", producing no output."; - continue; - } - } - } - BaseFloat vtln_warp_local; // Work out VTLN warp factor. - if (vtln_map_rspecifier != "") { - if (!vtln_map_reader.HasKey(utt)) { - KALDI_WARN << "No vtln-map entry for utterance-id (or speaker-id) " - << utt; - continue; - } - vtln_warp_local = vtln_map_reader.Value(utt); - } else { - vtln_warp_local = vtln_warp; - } - - SubVector waveform(wave_data.Data(), this_chan); - Matrix features; - try { - plp.ComputeFeatures(waveform, wave_data.SampFreq(), - vtln_warp_local, &features); - } catch (...) { - KALDI_WARN << "Failed to compute features for utterance " << utt; - continue; - } - if (subtract_mean) { - Vector mean(features.NumCols()); - mean.AddRowSumMat(1.0, features); - mean.Scale(1.0 / features.NumRows()); - for (size_t i = 0; i < features.NumRows(); i++) - features.Row(i).AddVec(-1.0, mean); - } - if (output_format == "kaldi") { - kaldi_writer.Write(utt, features); - } else { - std::pair, HtkHeader> p; - p.first.Resize(features.NumRows(), features.NumCols()); - p.first.CopyFromMat(features); - HtkHeader header = { - features.NumRows(), - 100000, // 10ms shift - static_cast(sizeof(float)*features.NumCols()), - 013 | // PLP - 020000 // C0 [no option currently to use energy in PLP. - }; - p.second = header; - htk_writer.Write(utt, p); - } - if (utt2dur_writer.IsOpen()) { - utt2dur_writer.Write(utt, wave_data.Duration()); - } - if (num_utts % 10 == 0) - KALDI_LOG << "Processed " << num_utts << " utterances"; - KALDI_VLOG(2) << "Processed features for key " << utt; - num_success++; - } - KALDI_LOG << " Done " << num_success << " out of " << num_utts - << " utterances."; - return (num_success != 0 ? 
0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/src/featbin/compute-spectrogram-feats.cc b/src/featbin/compute-spectrogram-feats.cc deleted file mode 100644 index 67932915278..00000000000 --- a/src/featbin/compute-spectrogram-feats.cc +++ /dev/null @@ -1,164 +0,0 @@ -// featbin/compute-spectrogram-feats.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "feat/feature-spectrogram.h" -#include "feat/wave-reader.h" -#include "util/common-utils.h" - - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - const char *usage = - "Create spectrogram feature files.\n" - "Usage: compute-spectrogram-feats [options...] " - "\n"; - - // Construct all the global objects. - ParseOptions po(usage); - SpectrogramOptions spec_opts; - // Define defaults for global options. - bool subtract_mean = false; - int32 channel = -1; - BaseFloat min_duration = 0.0; - std::string output_format = "kaldi"; - std::string utt2dur_wspecifier; - - // Register the option struct - spec_opts.Register(&po); - // Register the options - po.Register("output-format", &output_format, - "Format of the output files [kaldi, htk]"); - po.Register("subtract-mean", &subtract_mean, "Subtract mean of each " - "feature file [CMS]; not recommended to do it this way. "); - po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, " - "0 -> left, 1 -> right)"); - po.Register("min-duration", &min_duration, "Minimum duration of segments " - "to process (in seconds)."); - po.Register("write-utt2dur", &utt2dur_wspecifier, "Wspecifier to write " - "duration of each utterance in seconds, e.g. 'ark,t:utt2dur'."); - - po.Read(argc, argv); - - if (po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - - std::string wav_rspecifier = po.GetArg(1); - - std::string output_wspecifier = po.GetArg(2); - - Spectrogram spec(spec_opts); - - SequentialTableReader reader(wav_rspecifier); - BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter. 
- TableWriter htk_writer; - - if (output_format == "kaldi") { - if (!kaldi_writer.Open(output_wspecifier)) - KALDI_ERR << "Could not initialize output with wspecifier " - << output_wspecifier; - } else if (output_format == "htk") { - if (!htk_writer.Open(output_wspecifier)) - KALDI_ERR << "Could not initialize output with wspecifier " - << output_wspecifier; - } else { - KALDI_ERR << "Invalid output_format string " << output_format; - } - - DoubleWriter utt2dur_writer(utt2dur_wspecifier); - - int32 num_utts = 0, num_success = 0; - for (; !reader.Done(); reader.Next()) { - num_utts++; - std::string utt = reader.Key(); - const WaveData &wave_data = reader.Value(); - if (wave_data.Duration() < min_duration) { - KALDI_WARN << "File: " << utt << " is too short (" - << wave_data.Duration() << " sec): producing no output."; - continue; - } - int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; - { // This block works out the channel (0=left, 1=right...) - KALDI_ASSERT(num_chan > 0); // should have been caught in - // reading code if no channels. - if (channel == -1) { - this_chan = 0; - if (num_chan != 1) - KALDI_WARN << "Channel not specified but you have data with " - << num_chan << " channels; defaulting to zero"; - } else { - if (this_chan >= num_chan) { - KALDI_WARN << "File with id " << utt << " has " - << num_chan << " channels but you specified channel " - << channel << ", producing no output."; - continue; - } - } - } - - SubVector waveform(wave_data.Data(), this_chan); - Matrix features; - try { - spec.ComputeFeatures(waveform, wave_data.SampFreq(), 1.0, &features); - } catch (...) { - KALDI_WARN << "Failed to compute features for utterance " << utt; - continue; - } - if (subtract_mean) { - Vector mean(features.NumCols()); - mean.AddRowSumMat(1.0, features); - mean.Scale(1.0 / features.NumRows()); - for (int32 i = 0; i < features.NumRows(); i++) - features.Row(i).AddVec(-1.0, mean); - } - if (output_format == "kaldi") { - kaldi_writer.Write(utt, features); - } else { - std::pair, HtkHeader> p; - p.first.Resize(features.NumRows(), features.NumCols()); - p.first.CopyFromMat(features); - int32 frame_shift = spec_opts.frame_opts.frame_shift_ms * 10000; - HtkHeader header = { - features.NumRows(), - frame_shift, - static_cast(sizeof(float)*features.NumCols()), - 007 | 020000 - }; - p.second = header; - htk_writer.Write(utt, p); - } - if (utt2dur_writer.IsOpen()) { - utt2dur_writer.Write(utt, wave_data.Duration()); - } - if(num_utts % 10 == 0) - KALDI_LOG << "Processed " << num_utts << " utterances"; - KALDI_VLOG(2) << "Processed features for key " << utt; - num_success++; - } - KALDI_LOG << " Done " << num_success << " out of " << num_utts - << " utterances."; - return (num_success != 0 ? 0 : 1); - } catch(const std::exception& e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/src/featbin/fmpe-acc-stats.cc b/src/featbin/fmpe-acc-stats.cc deleted file mode 100644 index c69e95b6b59..00000000000 --- a/src/featbin/fmpe-acc-stats.cc +++ /dev/null @@ -1,108 +0,0 @@ -// featbin/fmpe-acc-stats.cc - -// Copyright 2012 Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
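Aside: the HtkHeader initializers in the two deleted tools (013 | 020000 for compute-plp-feats, 007 | 020000 here) follow the standard 12-byte HTK parameter-file header: int32 frame count, int32 sample period in 100 ns units (so 100000 is a 10 ms shift), int16 bytes per frame, int16 parameter kind plus qualifier bits; octal 013 is HTK's PLP kind, 007 is FBANK, and 020000 is the "_0" qualifier. A standalone sketch of packing such a header big-endian, independent of Kaldi:

#include <cstdint>
#include <fstream>

// Write the 12-byte HTK parameter-file header (big-endian), with the same
// fields the deleted tools filled in: nSamples, sampPeriod (100 ns units),
// sampSize (bytes per frame), parmKind (base kind | qualifier bits).
static void WriteBE32(std::ostream &os, uint32_t v) {
  unsigned char b[4] = { (unsigned char)(v >> 24), (unsigned char)(v >> 16),
                         (unsigned char)(v >> 8),  (unsigned char)(v) };
  os.write(reinterpret_cast<char*>(b), 4);
}
static void WriteBE16(std::ostream &os, uint16_t v) {
  unsigned char b[2] = { (unsigned char)(v >> 8), (unsigned char)(v) };
  os.write(reinterpret_cast<char*>(b), 2);
}

void WriteHtkHeader(std::ostream &os, int num_frames, int feat_dim,
                    float frame_shift_ms, uint16_t parm_kind) {
  WriteBE32(os, (uint32_t)num_frames);
  WriteBE32(os, (uint32_t)(frame_shift_ms * 10000));    // ms -> 100 ns units.
  WriteBE16(os, (uint16_t)(sizeof(float) * feat_dim));  // bytes per frame.
  WriteBE16(os, parm_kind);
}

int main() {
  std::ofstream os("example.htk", std::ios::binary);
  // 013 (PLP) | 020000 (_0 qualifier), as in the deleted compute-plp-feats.
  WriteHtkHeader(os, /*num_frames=*/500, /*feat_dim=*/13,
                 /*frame_shift_ms=*/10.0f, 013 | 020000);
  return 0;
}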
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "transform/fmpe.h" - -int main(int argc, char *argv[]) { - using namespace kaldi; - using kaldi::int32; - try { - const char *usage = - "Compute statistics for fMPE training\n" - "Usage: fmpe-acc-stats [options...] " - " \n" - "Note: gmm-fmpe-acc-stats avoids computing the features an extra time\n"; - - ParseOptions po(usage); - bool binary = true; - po.Register("binary", &binary, "If true, output stats in binary mode."); - po.Read(argc, argv); - - if (po.NumArgs() != 5) { - po.PrintUsage(); - exit(1); - } - - std::string fmpe_rxfilename = po.GetArg(1), - feat_rspecifier = po.GetArg(2), - feat_diff_rspecifier = po.GetArg(3), - gselect_rspecifier = po.GetArg(4), - stats_wxfilename = po.GetArg(5); - - Fmpe fmpe; - ReadKaldiObject(fmpe_rxfilename, &fmpe); - - SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier); - RandomAccessBaseFloatMatrixReader diff_reader(feat_diff_rspecifier); - RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - - // fmpe stats... - FmpeStats fmpe_stats(fmpe); - - int32 num_done = 0, num_err = 0; - - for (; !feat_reader.Done(); feat_reader.Next()) { - std::string key = feat_reader.Key(); - const Matrix feat_in(feat_reader.Value()); - if (!gselect_reader.HasKey(key)) { - KALDI_WARN << "No gselect information for key " << key; - num_err++; - continue; - } - const std::vector > &gselect = - gselect_reader.Value(key); - if (static_cast(gselect.size()) != feat_in.NumRows()) { - KALDI_WARN << "gselect information has wrong size"; - num_err++; - continue; - } - if (!diff_reader.HasKey(key)) { - KALDI_WARN << "No gradient information for key " << key; - num_err++; - continue; - } - const Matrix &feat_deriv = diff_reader.Value(key); - - if (feat_deriv.NumCols() == feat_in.NumCols()) { // Only direct derivative. - fmpe.AccStats(feat_in, gselect, feat_deriv, NULL, &fmpe_stats); - } else if (feat_deriv.NumCols() == feat_in.NumCols() * 2) { // +indirect. - SubMatrix direct_deriv(feat_deriv, 0, feat_deriv.NumRows(), - 0, feat_in.NumCols()), - indirect_deriv(feat_deriv, 0, feat_deriv.NumRows(), - feat_in.NumCols(), feat_in.NumCols()); - fmpe.AccStats(feat_in, gselect, direct_deriv, &indirect_deriv, &fmpe_stats); - } else { - KALDI_ERR << "Mismatch in dimension of feature derivative."; - } - num_done++; - } - - KALDI_LOG << " Done " << num_done << " utterances, " << num_err - << " had errors."; - - WriteKaldiObject(fmpe_stats, stats_wxfilename, binary); - - return (num_done != 0 ? 
0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/src/featbin/fmpe-apply-transform.cc b/src/featbin/fmpe-apply-transform.cc deleted file mode 100644 index 9473e5f287b..00000000000 --- a/src/featbin/fmpe-apply-transform.cc +++ /dev/null @@ -1,89 +0,0 @@ -// featbin/fmpe-apply-transform.cc - -// Copyright 2012 Johns Hopkins University (Author: Daniel Povey) Yanmin Qian - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "transform/fmpe.h" - -int main(int argc, char *argv[]) { - using namespace kaldi; - using kaldi::int32; - try { - const char *usage = - "Apply fMPE transform to features\n" - "Usage: fmpe-apply-transform [options...] " - " \n"; - - ParseOptions po(usage); - bool add_to_features = true; - po.Register("add-to-features", &add_to_features, "If true, add original " - "features to fMPE offsets (false useful for diagnostics)"); - // no non-default options. - po.Read(argc, argv); - - if (po.NumArgs() != 4) { - po.PrintUsage(); - exit(1); - } - - std::string fmpe_rxfilename = po.GetArg(1), - feat_rspecifier = po.GetArg(2), - gselect_rspecifier = po.GetArg(3), - feat_wspecifier = po.GetArg(4); - - Fmpe fmpe; - ReadKaldiObject(fmpe_rxfilename, &fmpe); - - SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier); - RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - BaseFloatMatrixWriter feat_writer(feat_wspecifier); - - int32 num_done = 0, num_err = 0; - - for (; !feat_reader.Done(); feat_reader.Next()) { - std::string key = feat_reader.Key(); - const Matrix feat_in(feat_reader.Value()); - if (!gselect_reader.HasKey(key)) { - KALDI_WARN << "No gselect information for key " << key; - num_err++; - continue; - } - const std::vector > &gselect = - gselect_reader.Value(key); - if (static_cast(gselect.size()) != feat_in.NumRows()) { - KALDI_WARN << "gselect information has wrong size"; - num_err++; - continue; - } - Matrix feat_out(feat_in.NumRows(), feat_in.NumCols()); - fmpe.ComputeFeatures(feat_in, gselect, &feat_out); - if (add_to_features) // feat_out += feat_in. - feat_out.AddMat(1.0, feat_in, kNoTrans); - - feat_writer.Write(key, feat_out); - num_done++; - } - KALDI_LOG << " Done " << num_done << " utterances, " << num_err - << " had errors."; - return (num_done != 0 ? 
0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/src/featbin/fmpe-est.cc b/src/featbin/fmpe-est.cc deleted file mode 100644 index 76463c32782..00000000000 --- a/src/featbin/fmpe-est.cc +++ /dev/null @@ -1,67 +0,0 @@ -// featbin/fmpe-est.cc - -// Copyright 2012 Johns Hopkins University (Author: Daniel Povey) Yanmin Qian - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "transform/fmpe.h" - -int main(int argc, char *argv[]) { - using namespace kaldi; - try { - const char *usage = - "Do one iteration of learning (modified gradient descent)\n" - "on fMPE transform\n" - "Usage: fmpe-est [options...] \n" - "E.g. fmpe-est 1.fmpe 1.accs 2.fmpe\n"; - - ParseOptions po(usage); - FmpeUpdateOptions opts; - bool binary = true; - po.Register("binary", &binary, "If true, output fMPE object in " - "binary mode."); - opts.Register(&po); - po.Read(argc, argv); - - if (po.NumArgs() != 3) { - po.PrintUsage(); - exit(1); - } - - std::string fmpe_rxfilename = po.GetArg(1), - stats_rxfilename = po.GetArg(2), - fmpe_wxfilename = po.GetArg(3); - - Fmpe fmpe; - ReadKaldiObject(fmpe_rxfilename, &fmpe); - FmpeStats stats; - ReadKaldiObject(stats_rxfilename, &stats); - - stats.DoChecks(); // checks certain checksums. - fmpe.Update(opts, stats); - - WriteKaldiObject(fmpe, fmpe_wxfilename, binary); - - KALDI_LOG << "Updated fMPE object and wrote to " - << fmpe_wxfilename; - return 0; - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/src/featbin/fmpe-init.cc b/src/featbin/fmpe-init.cc deleted file mode 100644 index 5f4455f44fc..00000000000 --- a/src/featbin/fmpe-init.cc +++ /dev/null @@ -1,63 +0,0 @@ -// featbin/fmpe-init.cc - -// Copyright 2012 Johns Hopkins University (Author: Daniel Povey) Yanmin Qian - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "transform/fmpe.h" - -int main(int argc, char *argv[]) { - using namespace kaldi; - try { - const char *usage = - "Initialize fMPE transform (to zero)\n" - "Usage: fmpe-init [options...] \n" - "E.g. 
fmpe-init 1.ubm 1.fmpe\n"; - - ParseOptions po(usage); - FmpeOptions opts; - bool binary = true; - po.Register("binary", &binary, "If true, output fMPE object in binary mode."); - opts.Register(&po); - po.Read(argc, argv); - - if (po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - - std::string dgmm_rxfilename = po.GetArg(1), - fmpe_wxfilename = po.GetArg(2); - - DiagGmm dgmm; - ReadKaldiObject(dgmm_rxfilename, &dgmm); - - - Fmpe fmpe(dgmm, opts); - - Output ko(fmpe_wxfilename, binary); - fmpe.Write(ko.Stream(), binary); - - KALDI_LOG << "Initialized fMPE object and wrote to " - << fmpe_wxfilename; - return 0; - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/src/featbin/fmpe-sum-accs.cc b/src/featbin/fmpe-sum-accs.cc deleted file mode 100644 index e2976abe5ff..00000000000 --- a/src/featbin/fmpe-sum-accs.cc +++ /dev/null @@ -1,63 +0,0 @@ -// featbin/fmpe-sum-accs.cc - -// Copyright 2012 Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "transform/fmpe.h" - -int main(int argc, char *argv[]) { - using namespace kaldi; - using kaldi::int32; - try { - const char *usage = - "Sum fMPE stats\n" - "Usage: fmpe-sum-accs [options...] ... \n" - "E.g. fmpe-sum-accs 1.accs 1.1.accs 1.2.accs 1.3.accs 1.4.accs\n"; - - ParseOptions po(usage); - bool binary = true; - po.Register("binary", &binary, "If true, output fMPE stats in " - "binary mode."); - po.Read(argc, argv); - - if (po.NumArgs() < 2) { - po.PrintUsage(); - exit(1); - } - - std::string stats_wxfilename = po.GetArg(1); - - FmpeStats stats; - for (int32 arg = 2; arg <= po.NumArgs(); arg++) { - std::string stats_rxfilename = po.GetArg(arg); - bool binary; - Input ki(stats_rxfilename, &binary); - stats.Read(ki.Stream(), binary, true); // true == sum accs. 
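Aside: fmpe-sum-accs relies on the common Kaldi accumulator convention that Read(stream, binary, add=true) adds the incoming stats into the object instead of replacing them, so summing is just a loop over input files. A minimal standalone illustration of that read-with-add pattern; the Accs type here is a toy, not the real FmpeStats.

#include <fstream>
#include <iostream>
#include <vector>

// Toy accumulator: a fixed-size vector of counts stored as text.
struct Accs {
  std::vector<double> counts;
  // If add == true, accumulate into the existing counts (the fmpe-sum-accs
  // pattern); otherwise start from zero.
  void Read(std::istream &is, bool add) {
    size_t n; is >> n;
    if (!add || counts.size() != n) counts.assign(n, 0.0);
    for (size_t i = 0; i < n; i++) { double c; is >> c; counts[i] += c; }
  }
  void Write(std::ostream &os) const {
    os << counts.size();
    for (double c : counts) os << ' ' << c;
    os << '\n';
  }
};

int main(int argc, char *argv[]) {
  if (argc < 3) { std::cerr << "usage: sum-accs <out> <in1> [<in2> ...]\n"; return 1; }
  Accs stats;
  for (int i = 2; i < argc; i++) {
    std::ifstream is(argv[i]);
    stats.Read(is, /*add=*/true);   // true == sum accs, as in fmpe-sum-accs.
  }
  std::ofstream os(argv[1]);
  stats.Write(os);
  return 0;
}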
- } - - WriteKaldiObject(stats, stats_wxfilename, binary); - - KALDI_LOG << "Summed " << (po.NumArgs()-1) << " fMPE stats and wrote to " - << stats_wxfilename; - return 0; - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/src/fgmmbin/Makefile b/src/fgmmbin/Makefile index 5db252477b5..060c5e06957 100644 --- a/src/fgmmbin/Makefile +++ b/src/fgmmbin/Makefile @@ -18,6 +18,6 @@ TESTFILES = ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \ ../feat/kaldi-feat.a ../transform/kaldi-transform.a \ ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/fgmmbin/fgmm-global-info.cc b/src/fgmmbin/fgmm-global-info.cc index e00384fe13f..867db3bdc50 100644 --- a/src/fgmmbin/fgmm-global-info.cc +++ b/src/fgmmbin/fgmm-global-info.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/full-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" int main(int argc, char *argv[]) { try { diff --git a/src/fgmmbin/fgmm-gselect.cc b/src/fgmmbin/fgmm-gselect.cc index ab36af74275..3d962972127 100644 --- a/src/fgmmbin/fgmm-gselect.cc +++ b/src/fgmmbin/fgmm-gselect.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/full-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" int main(int argc, char *argv[]) { try { diff --git a/src/fstbin/Makefile b/src/fstbin/Makefile index a22c014a7d5..c91e405e2c2 100644 --- a/src/fstbin/Makefile +++ b/src/fstbin/Makefile @@ -26,6 +26,6 @@ TESTFILES = LIBFILE = ADDLIBS = ../decoder/kaldi-decoder.a ../fstext/kaldi-fstext.a \ - ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/fstext/Makefile b/src/fstext/Makefile index b76bd413c42..655437dc52d 100644 --- a/src/fstext/Makefile +++ b/src/fstext/Makefile @@ -24,7 +24,7 @@ LIBNAME = kaldi-fstext # tree and matrix archives needed for test-context-fst # matrix archive needed for push-special. -ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ +ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \ ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h index 37d50fa5d80..1c0e0cbf16a 100644 --- a/src/fstext/fstext-utils-inl.h +++ b/src/fstext/fstext-utils-inl.h @@ -532,18 +532,17 @@ void MakePrecedingInputSymbolsSame(bool start_is_epsilon, MutableFst *fst) template void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst *fst, const F &f) { - typedef typename F::Result ClassType; typedef typename Arc::StateId StateId; typedef typename Arc::Weight Weight; - std::vector classes; - ClassType noClass = f(kNoLabel); - ClassType epsClass = f(0); + std::vector classes; + int32 no_class = f(kNoLabel), + eps_class = f(0); if (start_is_epsilon) { // treat having-start-state as epsilon in-transition. StateId start_state = fst->Start(); if (start_state < 0 || start_state == kNoStateId) // empty FST. 
return; - classes.resize(start_state+1, noClass); - classes[start_state] = epsClass; + classes.resize(start_state+1, no_class); + classes[start_state] = eps_class; } // Find bad states (states with multiple input-symbols into them). @@ -553,8 +552,8 @@ void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst * for (ArcIterator > aiter(*fst, s); !aiter.Done(); aiter.Next()) { const Arc &arc = aiter.Value(); if (classes.size() <= static_cast(arc.nextstate)) - classes.resize(arc.nextstate+1, noClass); - if (classes[arc.nextstate] == noClass) + classes.resize(arc.nextstate+1, no_class); + if (classes[arc.nextstate] == no_class) classes[arc.nextstate] = f(arc.ilabel); else if (classes[arc.nextstate] != f(arc.ilabel)) @@ -562,6 +561,7 @@ void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst * } } if (bad_states.empty()) return; // Nothing to do. + kaldi::ConstIntegerSet bad_states_ciset(bad_states); // faster lookup. // Work out list of arcs we have to change as (state, arc-offset). @@ -579,7 +579,7 @@ void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst * } KALDI_ASSERT(!arcs_to_change.empty()); // since !bad_states.empty(). - std::map, StateId> state_map; + std::map, StateId> state_map; // state_map is a map from (bad-state, input-symbol-class) to dummy-state. for (size_t i = 0; i < arcs_to_change.size(); i++) { @@ -590,7 +590,7 @@ void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst * // Transition is non-eps transition to "bad" state. Introduce new state (or find // existing one). - std::pair p(arc.nextstate, f(arc.ilabel)); + std::pair p(arc.nextstate, f(arc.ilabel)); if (state_map.count(p) == 0) { StateId newstate = state_map[p] = fst->AddState(); fst->AddArc(newstate, Arc(0, 0, Weight::One(), arc.nextstate)); @@ -606,65 +606,6 @@ void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst * } } -template -void MakeFollowingInputSymbolsSame(bool end_is_epsilon, MutableFst *fst) { - IdentityFunction f; - MakeFollowingInputSymbolsSameClass(end_is_epsilon, fst, f); -} - -template -void MakeFollowingInputSymbolsSameClass(bool end_is_epsilon, MutableFst *fst, const F &f) { - typedef typename Arc::StateId StateId; - typedef typename Arc::Weight Weight; - typedef typename F::Result ClassType; - std::vector bad_states; - ClassType noClass = f(kNoLabel); - ClassType epsClass = f(0); - for (StateIterator > siter(*fst); !siter.Done(); siter.Next()) { - StateId s = siter.Value(); - ClassType c = noClass; - bool bad = false; - for (ArcIterator > aiter(*fst, s); !aiter.Done(); aiter.Next()) { - const Arc &arc = aiter.Value(); - if (c == noClass) - c = f(arc.ilabel); - else - if (c != f(arc.ilabel)) { - bad = true; - break; - } - } - if (end_is_epsilon && c != noClass && - c != epsClass && fst->Final(s) != Weight::Zero()) - bad = true; - if (bad) - bad_states.push_back(s); - } - std::vector my_arcs; - for (size_t i = 0; i < bad_states.size(); i++) { - StateId s = bad_states[i]; - my_arcs.clear(); - for (ArcIterator > aiter(*fst, s); !aiter.Done(); aiter.Next()) - my_arcs.push_back(aiter.Value()); - - for (size_t j = 0; j < my_arcs.size(); j++) { - Arc &arc = my_arcs[j]; - if (arc.ilabel != 0) { - StateId newstate = fst->AddState(); - // Create a new state for each non-eps arc in original FST, out of each bad state. 
- // Not as optimal as it could be, but does avoid some complicated weight-pushing - // issues in which, to maintain stochasticity, we would have to know which semiring - // we want to maintain stochasticity in. - fst->AddArc(newstate, Arc(arc.ilabel, 0, Weight::One(), arc.nextstate)); - MutableArcIterator > maiter(fst, s); - maiter.Seek(j); - maiter.SetValue(Arc(0, arc.olabel, arc.weight, newstate)); - } - } - } -} - - template VectorFst* MakeLoopFst(const std::vector *> &fsts) { typedef typename Arc::Weight Weight; diff --git a/src/fstext/fstext-utils-test.cc b/src/fstext/fstext-utils-test.cc index 4ce296f093a..ac02b7ec305 100644 --- a/src/fstext/fstext-utils-test.cc +++ b/src/fstext/fstext-utils-test.cc @@ -217,19 +217,13 @@ template void TestAcceptorMinimize() { template void TestMakeSymbolsSame() { VectorFst *fst = RandFst(); - bool foll = (kaldi::Rand() % 2 == 0); bool is_symbol = (kaldi::Rand() % 2 == 0); VectorFst fst2(*fst); - if (foll) { - MakeFollowingInputSymbolsSame(is_symbol, &fst2); - assert(FollowingInputSymbolsAreSame(is_symbol, fst2)); - } else { - MakePrecedingInputSymbolsSame(is_symbol, &fst2); - assert(PrecedingInputSymbolsAreSame(is_symbol, fst2)); - } + MakePrecedingInputSymbolsSame(is_symbol, &fst2); + assert(PrecedingInputSymbolsAreSame(is_symbol, fst2)); assert(RandEquivalent(*fst, fst2, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); @@ -254,20 +248,14 @@ struct TestFunctor { template void TestMakeSymbolsSameClass() { VectorFst *fst = RandFst(); - bool foll = (kaldi::Rand() % 2 == 0); bool is_symbol = (kaldi::Rand() % 2 == 0); VectorFst fst2(*fst); TestFunctor f; - if (foll) { - MakeFollowingInputSymbolsSameClass(is_symbol, &fst2, f); - assert(FollowingInputSymbolsAreSameClass(is_symbol, fst2, f)); - } else { - MakePrecedingInputSymbolsSameClass(is_symbol, &fst2, f); - assert(PrecedingInputSymbolsAreSameClass(is_symbol, fst2, f)); - } + MakePrecedingInputSymbolsSameClass(is_symbol, &fst2, f); + assert(PrecedingInputSymbolsAreSameClass(is_symbol, fst2, f)); assert(RandEquivalent(*fst, fst2, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); diff --git a/src/fstext/fstext-utils.h b/src/fstext/fstext-utils.h index fb55ad69f72..950c0c87c60 100644 --- a/src/fstext/fstext-utils.h +++ b/src/fstext/fstext-utils.h @@ -230,7 +230,7 @@ bool PrecedingInputSymbolsAreSame(bool start_is_epsilon, const Fst &fst); /// F::Result F::operator() (F::Arg a) const; /// where F::Result is an integer type and F::Arc can be constructed from Arc::Label. /// this must apply to valid labels and also to kNoLabel (so we can have a marker for -/// the invalid labels. +/// the invalid labels). template bool PrecedingInputSymbolsAreSameClass(bool start_is_epsilon, const Fst &fst, const F &f); @@ -258,29 +258,13 @@ template void MakePrecedingInputSymbolsSame(bool start_is_epsilon, MutableFst *fst); -/// As MakePrecedingInputSymbolsSame, but takes a functor object that maps labels to classes. +/// As MakePrecedingInputSymbolsSame, but takes a functor object that maps +/// labels to (int32) classes. Caution: it must not map kNoLabel (-1) to the +/// same value as any real symbol (it should generally map -1 to -1). template void MakePrecedingInputSymbolsSameClass(bool start_is_epsilon, MutableFst *fst, const F &f); -/// MakeFollowingInputSymbolsSame ensures that all arcs exiting any given fst -/// state have the same input symbol. 
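Aside: with this change MakePrecedingInputSymbolsSameClass takes a functor returning int32 rather than a user-defined F::Result, and the only contract is the one stated in the updated comment: f(kNoLabel), i.e. f(-1), must not collide with the class of any real label, and f(0) identifies epsilon. A small standalone sketch of a conforming functor (plain C++; kNoLabel here is simply OpenFst's value of -1):

#include <cassert>
#include <cstdint>

typedef int32_t int32;
const int32 kNoLabel = -1;  // the value OpenFst uses for "no label".

// Example class function: epsilon is its own class, other labels fall into a
// coarse non-negative bucket, and kNoLabel maps to -1, which no real label
// can produce -- the requirement stated for MakePrecedingInputSymbolsSameClass.
struct LabelBucketFunction {
  int32 operator()(int32 label) const {
    if (label == kNoLabel) return -1;
    if (label == 0) return 0;        // epsilon class.
    return 1 + (label % 100);        // arbitrary non-negative bucket.
  }
};

int main() {
  LabelBucketFunction f;
  assert(f(kNoLabel) == -1);
  assert(f(0) == 0);
  assert(f(250) == 51);
  return 0;
}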
It does this by detecting states that have -/// differing input symbols on arcs that exit it, and inserting, for each of the -/// following arcs with non-epsilon input symbol, a new dummy state that has an -/// input-epsilon link from the fst state. The output symbol and weight stay on the -/// link to the dummy state (in order to keep the FST output-deterministic and -/// stochastic, if it already was). -/// If end_is_epsilon, treat "being a final-state" like having an epsilon output -/// link. -template -void MakeFollowingInputSymbolsSame(bool end_is_epsilon, MutableFst *fst); - -/// As MakeFollowingInputSymbolsSame, but takes a functor object that maps labels to classes. -template -void MakeFollowingInputSymbolsSameClass(bool end_is_epsilon, MutableFst *fst, const F &f); - - - /// MakeLoopFst creates an FST that has a state that is both initial and /// final (weight == Weight::One()), and for each non-NULL pointer fsts[i], diff --git a/src/fstext/kaldi-fst-io-inl.h b/src/fstext/kaldi-fst-io-inl.h index b6bae4b9dc9..4870acdd0cc 100644 --- a/src/fstext/kaldi-fst-io-inl.h +++ b/src/fstext/kaldi-fst-io-inl.h @@ -163,7 +163,84 @@ void ReadFstKaldi(std::istream &is, bool binary, } } +template +void ReadFsaKaldi(std::istream &is, VectorFst *fst) { + typedef typename Arc::Weight Weight; + typedef typename Arc::StateId StateId; + // Consume the \r on Windows, the \n that the text-form FSA format starts + // with, and any extra spaces that might have got in there somehow. + while (std::isspace(is.peek()) && is.peek() != '\n') is.get(); + if (is.peek() == '\n') is.get(); // consume the newline. + else { // saw spaces but no newline.. this is not expected. + KALDI_ERR << "Reading FSA: unexpected sequence of spaces " + << " at file position " << is.tellg(); + } + using std::string; + using std::vector; + using kaldi::SplitStringToIntegers; + using kaldi::ConvertStringToInteger; + fst->DeleteStates(); + string line; + size_t nline = 0; + string separator = FLAGS_fst_field_separator + "\r\n"; + while (std::getline(is, line)) { + nline++; + vector col; + // on Windows we'll write in text and read in binary mode. + kaldi::SplitStringToVector(line, separator.c_str(), true, &col); + if (col.size() == 0) break; // Empty line is a signal to stop, in our + // archive format. 
+ if (col.size() > 4) { + KALDI_ERR << "Bad line in FSA: " << line; + } + StateId s; + if (!ConvertStringToInteger(col[0], &s)) { + KALDI_ERR << "Bad line in FSA: " << line; + } + while (s >= fst->NumStates()) + fst->AddState(); + if (nline == 1) fst->SetStart(s); + bool ok = true; + Arc arc; + Weight w; + StateId d = s; + switch (col.size()) { + case 1: + fst->SetFinal(s, Weight::One()); + break; + case 2: + if (!StrToWeight(col[1], true, &w)) ok = false; + else fst->SetFinal(s, w); + break; + case 3: + ok = ConvertStringToInteger(col[1], &arc.nextstate) && + ConvertStringToInteger(col[2], &arc.ilabel); + arc.olabel = arc.ilabel; + if (ok) { + d = arc.nextstate; + arc.weight = Weight::One(); + fst->AddArc(s, arc); + } + break; + case 4: + ok = ConvertStringToInteger(col[1], &arc.nextstate) && + ConvertStringToInteger(col[2], &arc.ilabel) && + StrToWeight(col[3], false, &arc.weight); + arc.olabel = arc.ilabel; + if (ok) { + d = arc.nextstate; + fst->AddArc(s, arc); + } + break; + default: + ok = false; + } + while (d >= fst->NumStates()) fst->AddState(); + if (!ok) + KALDI_ERR << "Bad line in FSA: " << line; + } +} template // static diff --git a/src/fstext/kaldi-fst-io.h b/src/fstext/kaldi-fst-io.h index 206dc71238a..5de870481ea 100644 --- a/src/fstext/kaldi-fst-io.h +++ b/src/fstext/kaldi-fst-io.h @@ -81,6 +81,12 @@ template void ReadFstKaldi(std::istream &is, bool binary, VectorFst *fst); +// A generic Kaldi-type-IO mechanism of reading FSAs from streams, +// supporting text-mode reading. +// Note that this assumes the input is an acceptor. +template +void ReadFsaKaldi(std::istream &is, VectorFst *fst); + // Read an FST file for LM (G.fst) and make it an acceptor, // and make sure it is sorted on labels fst::VectorFst *ReadAndPrepareLmFst(std::string rxfilename); diff --git a/src/gmm/Makefile b/src/gmm/Makefile index caee6734afe..64fe320507a 100644 --- a/src/gmm/Makefile +++ b/src/gmm/Makefile @@ -9,13 +9,13 @@ TESTFILES = diag-gmm-test mle-diag-gmm-test full-gmm-test mle-full-gmm-test \ OBJFILES = diag-gmm.o diag-gmm-normal.o mle-diag-gmm.o am-diag-gmm.o \ mle-am-diag-gmm.o full-gmm.o full-gmm-normal.o mle-full-gmm.o \ - model-common.o decodable-am-diag-gmm.o model-test-common.o \ - ebw-diag-gmm.o indirect-diff-diag-gmm.o + model-common.o decodable-am-diag-gmm.o model-test-common.o \ + ebw-diag-gmm.o indirect-diff-diag-gmm.o LIBNAME = kaldi-gmm -ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a +ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \ + ../base/kaldi-base.a diff --git a/src/gmm/decodable-am-diag-gmm.h b/src/gmm/decodable-am-diag-gmm.h index 745b4f61b14..f2e03005708 100644 --- a/src/gmm/decodable-am-diag-gmm.h +++ b/src/gmm/decodable-am-diag-gmm.h @@ -26,11 +26,9 @@ #include "base/kaldi-common.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "itf/decodable-itf.h" -#include "transform/regression-tree.h" -#include "transform/regtree-fmllr-diag-gmm.h" -#include "transform/regtree-mllr-diag-gmm.h" + namespace kaldi { @@ -46,13 +44,13 @@ class DecodableAmDiagGmmUnmapped : public DecodableInterface { public: /// If you set log_sum_exp_prune to a value greater than 0 it will prune /// in the LogSumExp operation (larger = more exact); I suggest 5. - /// This is advisable if it's spending a long time doing exp - /// operations. + /// This is advisable if it's spending a long time doing exp + /// operations. 
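Aside: the new ReadFsaKaldi accepts the usual OpenFst text format restricted to acceptors: each line is either "state [final-weight]" or "src dst ilabel [weight]", the state on the first line becomes the start state, the output label is copied from the input label, and a blank line terminates the FSA (the archive convention). A hedged usage sketch, assuming kaldi-fst-io.h pulls in the -inl definitions the same way the existing ReadFstKaldi template does:

#include <iostream>
#include <sstream>
#include "fstext/kaldi-fst-io.h"   // declares the new template ReadFsaKaldi().

int main() {
  // A 3-state acceptor: 0 -> 1 on label 5, 1 -> 2 on label 7 with weight 0.5,
  // state 2 is final.  There is no output-label column: this is the FSA format.
  // The leading newline matches what the text-archive reader leaves in the
  // stream before the FSA body.
  std::istringstream is("\n0 1 5\n1 2 7 0.5\n2\n\n");
  fst::VectorFst<fst::StdArc> fst;
  fst::ReadFsaKaldi(is, &fst);
  std::cout << "states: " << fst.NumStates() << "\n";  // expect 3.
  return 0;
}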
DecodableAmDiagGmmUnmapped(const AmDiagGmm &am, const Matrix &feats, BaseFloat log_sum_exp_prune = -1.0): acoustic_model_(am), feature_matrix_(feats), - previous_frame_(-1), log_sum_exp_prune_(log_sum_exp_prune), + previous_frame_(-1), log_sum_exp_prune_(log_sum_exp_prune), data_squared_(feats.NumCols()) { ResetLogLikeCache(); } @@ -63,7 +61,7 @@ class DecodableAmDiagGmmUnmapped : public DecodableInterface { return LogLikelihoodZeroBased(frame, state_index - 1); } virtual int32 NumFramesReady() const { return feature_matrix_.NumRows(); } - + // Indices are one-based! This is for compatibility with OpenFst. virtual int32 NumIndices() const { return acoustic_model_.NumPdfs(); } @@ -98,7 +96,7 @@ class DecodableAmDiagGmmUnmapped : public DecodableInterface { class DecodableAmDiagGmm: public DecodableAmDiagGmmUnmapped { public: DecodableAmDiagGmm(const AmDiagGmm &am, - const TransitionModel &tm, + const Transitions &tm, const Matrix &feats, BaseFloat log_sum_exp_prune = -1.0) : DecodableAmDiagGmmUnmapped(am, feats, log_sum_exp_prune), @@ -107,21 +105,21 @@ class DecodableAmDiagGmm: public DecodableAmDiagGmmUnmapped { // Note, frames are numbered from zero. virtual BaseFloat LogLikelihood(int32 frame, int32 tid) { return LogLikelihoodZeroBased(frame, - trans_model_.TransitionIdToPdf(tid)); + trans_model_.TransitionIdToPdfFast(tid)); } // Indices are one-based! This is for compatibility with OpenFst. virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); } - const TransitionModel *TransModel() { return &trans_model_; } + const Transitions *TransModel() { return &trans_model_; } private: // want to access public to have pdf id information - const TransitionModel &trans_model_; // for tid to pdf mapping + const Transitions &trans_model_; // for tid to pdf mapping KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmDiagGmm); }; class DecodableAmDiagGmmScaled: public DecodableAmDiagGmmUnmapped { public: DecodableAmDiagGmmScaled(const AmDiagGmm &am, - const TransitionModel &tm, + const Transitions &tm, const Matrix &feats, BaseFloat scale, BaseFloat log_sum_exp_prune = -1.0): @@ -131,7 +129,7 @@ class DecodableAmDiagGmmScaled: public DecodableAmDiagGmmUnmapped { // This version of the initializer takes ownership of the pointer // "feats" and will delete it when this class is destroyed. DecodableAmDiagGmmScaled(const AmDiagGmm &am, - const TransitionModel &tm, + const Transitions &tm, BaseFloat scale, BaseFloat log_sum_exp_prune, Matrix *feats): @@ -140,20 +138,20 @@ class DecodableAmDiagGmmScaled: public DecodableAmDiagGmmUnmapped { // Note, frames are numbered from zero but transition-ids from one. virtual BaseFloat LogLikelihood(int32 frame, int32 tid) { - return scale_*LogLikelihoodZeroBased(frame, - trans_model_.TransitionIdToPdf(tid)); + return scale_ * LogLikelihoodZeroBased( + frame, trans_model_.TransitionIdToPdfFast(tid)); } // Indices are one-based! This is for compatibility with OpenFst. 
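Aside: the "Indices are one-based! This is for compatibility with OpenFst" convention is why LogLikelihood(frame, index) subtracts one before indexing, and why the scaled variant multiplies the zero-based log-likelihood by an acoustic scale. A toy standalone illustration of just that index-and-scale plumbing (not the real Kaldi DecodableInterface):

#include <cassert>
#include <vector>

// Toy "decodable": loglikes_[frame][pdf] holds per-frame log-likelihoods.
class ToyDecodable {
 public:
  ToyDecodable(std::vector<std::vector<double> > loglikes, double scale)
      : loglikes_(loglikes), scale_(scale) {}
  // 'index' is one-based, following the OpenFst convention above: label 0 is
  // reserved for epsilon, so real indices start at 1.
  double LogLikelihood(int frame, int index) const {
    return scale_ * LogLikelihoodZeroBased(frame, index - 1);
  }
  int NumIndices() const {
    return loglikes_.empty() ? 0 : (int)loglikes_[0].size();
  }
 private:
  double LogLikelihoodZeroBased(int frame, int pdf) const {
    return loglikes_[frame][pdf];
  }
  std::vector<std::vector<double> > loglikes_;
  double scale_;
};

int main() {
  ToyDecodable d({{-1.0, -2.0}, {-3.0, -4.0}}, /*scale=*/0.1);
  assert(d.NumIndices() == 2);
  assert(d.LogLikelihood(/*frame=*/1, /*index=*/2) == 0.1 * -4.0);
  return 0;
}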
virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); } - const TransitionModel *TransModel() { return &trans_model_; } + const Transitions *TransModel() { return &trans_model_; } virtual ~DecodableAmDiagGmmScaled() { delete delete_feats_; } - + private: // want to access it public to have pdf id information - const TransitionModel &trans_model_; // for transition-id to pdf mapping + const Transitions &trans_model_; // for transition-id to pdf mapping BaseFloat scale_; Matrix *delete_feats_; KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmDiagGmmScaled); diff --git a/src/gmmbin/Makefile b/src/gmmbin/Makefile index 82d10abe9ce..f43dfa96ccb 100644 --- a/src/gmmbin/Makefile +++ b/src/gmmbin/Makefile @@ -6,25 +6,24 @@ include ../kaldi.mk BINFILES = gmm-init-mono gmm-est gmm-acc-stats-ali gmm-align \ gmm-decode-faster gmm-decode-simple gmm-align-compiled \ - gmm-sum-accs gmm-est-regtree-fmllr gmm-acc-stats-twofeats \ + gmm-sum-accs gmm-acc-stats-twofeats \ gmm-acc-stats gmm-init-lvtln gmm-est-lvtln-trans gmm-train-lvtln-special \ gmm-acc-mllt gmm-mixup gmm-init-model gmm-transform-means \ - gmm-make-regtree gmm-decode-faster-regtree-fmllr gmm-post-to-gpost \ - gmm-est-fmllr-gpost gmm-est-fmllr gmm-est-regtree-fmllr-ali \ - gmm-est-regtree-mllr gmm-compute-likes \ - gmm-decode-faster-regtree-mllr gmm-latgen-simple \ + gmm-post-to-gpost \ + gmm-est-fmllr-gpost gmm-est-fmllr gmm-compute-likes \ + gmm-latgen-simple \ gmm-rescore-lattice gmm-decode-biglm-faster \ gmm-est-gaussians-ebw gmm-est-weights-ebw gmm-latgen-faster gmm-copy \ gmm-global-acc-stats gmm-global-est gmm-global-sum-accs gmm-gselect \ gmm-latgen-biglm-faster gmm-ismooth-stats gmm-global-get-frame-likes \ gmm-global-est-fmllr gmm-global-to-fgmm gmm-global-acc-stats-twofeats \ - gmm-global-copy gmm-fmpe-acc-stats gmm-acc-stats2 gmm-init-model-flat gmm-info \ + gmm-global-copy gmm-acc-stats2 gmm-init-model-flat gmm-info \ gmm-get-stats-deriv gmm-est-rescale gmm-boost-silence \ gmm-basis-fmllr-accs gmm-basis-fmllr-training gmm-est-basis-fmllr \ gmm-est-map gmm-adapt-map gmm-latgen-map gmm-basis-fmllr-accs-gpost \ gmm-est-basis-fmllr-gpost gmm-latgen-faster-parallel \ - gmm-est-fmllr-raw gmm-est-fmllr-raw-gpost gmm-global-init-from-feats \ - gmm-global-info gmm-latgen-faster-regtree-fmllr gmm-est-fmllr-global \ + gmm-global-init-from-feats \ + gmm-global-info gmm-est-fmllr-global \ gmm-acc-mllt-global gmm-transform-means-global gmm-global-get-post \ gmm-global-gselect-to-post gmm-global-est-lvtln-trans gmm-init-biphone @@ -37,8 +36,8 @@ TESTFILES = ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a \ ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/gmmbin/gmm-acc-mllt-global.cc b/src/gmmbin/gmm-acc-mllt-global.cc index bed91c053d3..ac3ec2237c9 100644 --- a/src/gmmbin/gmm-acc-mllt-global.cc +++ b/src/gmmbin/gmm-acc-mllt-global.cc @@ -23,7 +23,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/mllt.h" #include "hmm/posterior.h" diff --git a/src/gmmbin/gmm-acc-mllt.cc b/src/gmmbin/gmm-acc-mllt.cc index 6e57f082a62..be0d501b3f5 100644 --- 
a/src/gmmbin/gmm-acc-mllt.cc +++ b/src/gmmbin/gmm-acc-mllt.cc @@ -22,7 +22,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/mllt.h" #include "hmm/posterior.h" @@ -58,7 +58,7 @@ int main(int argc, char *argv[]) { typedef kaldi::int32 int32; AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary; Input ki(model_filename, &binary); diff --git a/src/gmmbin/gmm-acc-stats-ali.cc b/src/gmmbin/gmm-acc-stats-ali.cc index 5552d45738e..baee5f8b814 100644 --- a/src/gmmbin/gmm-acc-stats-ali.cc +++ b/src/gmmbin/gmm-acc-stats-ali.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "gmm/mle-am-diag-gmm.h" @@ -53,7 +53,7 @@ int main(int argc, char *argv[]) { accs_wxfilename = po.GetArg(4); AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary; Input ki(model_filename, &binary); @@ -61,8 +61,6 @@ int main(int argc, char *argv[]) { am_gmm.Read(ki.Stream(), binary); } - Vector transition_accs; - trans_model.InitStats(&transition_accs); AccumAmDiagGmm gmm_accs; gmm_accs.Init(am_gmm, kGmmAll); @@ -94,8 +92,7 @@ int main(int argc, char *argv[]) { for (size_t i = 0; i < alignment.size(); i++) { int32 tid = alignment[i], // transition identifier. - pdf_id = trans_model.TransitionIdToPdf(tid); - trans_model.Accumulate(1.0, tid, &transition_accs); + pdf_id = trans_model.TransitionIdToPdfFast(tid); tot_like_this_file += gmm_accs.AccumulateForGmm(am_gmm, mat.Row(i), pdf_id, 1.0); } @@ -117,7 +114,6 @@ int main(int argc, char *argv[]) { { Output ko(accs_wxfilename, binary); - transition_accs.Write(ko.Stream(), binary); gmm_accs.Write(ko.Stream(), binary); } KALDI_LOG << "Written accs."; diff --git a/src/gmmbin/gmm-acc-stats-twofeats.cc b/src/gmmbin/gmm-acc-stats-twofeats.cc index 05f94ff5ef6..7b3cadfdb9b 100644 --- a/src/gmmbin/gmm-acc-stats-twofeats.cc +++ b/src/gmmbin/gmm-acc-stats-twofeats.cc @@ -23,7 +23,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "gmm/mle-am-diag-gmm.h" #include "hmm/posterior.h" @@ -59,7 +59,7 @@ int main(int argc, char *argv[]) { typedef kaldi::int32 int32; AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary; Input ki(model_filename, &binary); @@ -67,8 +67,6 @@ int main(int argc, char *argv[]) { am_gmm.Read(ki.Stream(), binary); } - Vector transition_accs; - trans_model.InitStats(&transition_accs); int32 new_dim = 0; AccumAmDiagGmm gmm_accs; // will initialize once we know new_dim. @@ -129,13 +127,6 @@ int main(int argc, char *argv[]) { weight); tot_weight_this_file += weight; } - - // Accumulates for transitions. 
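Aside: after this patch the accumulation loops no longer keep per-transition counts; each aligned transition-id is only mapped to its pdf-id and the frame is accumulated against that pdf. A standalone toy version of that loop, with the tid-to-pdf mapping passed in as a plain function standing in for Transitions::TransitionIdToPdfFast:

#include <functional>
#include <iostream>
#include <map>
#include <vector>

// Accumulate per-pdf frame counts from an alignment of transition-ids.
// 'tid_to_pdf' stands in for Transitions::TransitionIdToPdfFast().
std::map<int, int> CountFramesPerPdf(const std::vector<int> &alignment,
                                     const std::function<int(int)> &tid_to_pdf) {
  std::map<int, int> counts;
  for (size_t i = 0; i < alignment.size(); i++) {
    int pdf_id = tid_to_pdf(alignment[i]);
    counts[pdf_id] += 1;   // the real code accumulates GMM stats here instead.
  }
  return counts;
}

int main() {
  // Toy mapping: pretend even transition-ids belong to pdf 0, odd ones to pdf 1.
  auto tid_to_pdf = [](int tid) { return tid % 2; };
  std::vector<int> alignment = {2, 2, 3, 4, 5, 5, 5};
  for (const auto &kv : CountFramesPerPdf(alignment, tid_to_pdf))
    std::cout << "pdf " << kv.first << ": " << kv.second << " frames\n";
  // Expect: pdf 0: 3 frames, pdf 1: 4 frames.
  return 0;
}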
- for (size_t j = 0; j < posterior[i].size(); j++) { - int32 tid = posterior[i][j].first; - BaseFloat weight = posterior[i][j].second; - trans_model.Accumulate(weight, tid, &transition_accs); - } } KALDI_LOG << "Average like for this file is " << (tot_like_this_file/tot_weight_this_file) << " over " @@ -157,7 +148,6 @@ int main(int argc, char *argv[]) { { Output ko(accs_wxfilename, binary); - transition_accs.Write(ko.Stream(), binary); gmm_accs.Write(ko.Stream(), binary); } KALDI_LOG << "Written accs."; diff --git a/src/gmmbin/gmm-acc-stats.cc b/src/gmmbin/gmm-acc-stats.cc index e213fffdeff..76a3528d4f4 100644 --- a/src/gmmbin/gmm-acc-stats.cc +++ b/src/gmmbin/gmm-acc-stats.cc @@ -22,7 +22,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "gmm/mle-am-diag-gmm.h" #include "hmm/posterior.h" @@ -59,7 +59,7 @@ int main(int argc, char *argv[]) { AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary; Input ki(model_filename, &binary); @@ -67,8 +67,6 @@ int main(int argc, char *argv[]) { am_gmm.Read(ki.Stream(), binary); } - Vector transition_accs; - trans_model.InitStats(&transition_accs); AccumAmDiagGmm gmm_accs; gmm_accs.Init(am_gmm, StringToGmmFlags(update_flags_str)); @@ -110,13 +108,6 @@ int main(int argc, char *argv[]) { * weight; tot_weight += weight; } - - // Accumulates for transitions. - for (size_t j = 0; j < posterior[i].size(); j++) { - int32 tid = posterior[i][j].first; - BaseFloat weight = posterior[i][j].second; - trans_model.Accumulate(weight, tid, &transition_accs); - } } if (num_done % 50 == 0) { KALDI_LOG << "Processed " << num_done << " utterances; for utterance " @@ -136,7 +127,6 @@ int main(int argc, char *argv[]) { { Output ko(accs_wxfilename, binary); - transition_accs.Write(ko.Stream(), binary); gmm_accs.Write(ko.Stream(), binary); } KALDI_LOG << "Written accs."; diff --git a/src/gmmbin/gmm-acc-stats2.cc b/src/gmmbin/gmm-acc-stats2.cc index 70730c8ca7d..15e97d07b73 100644 --- a/src/gmmbin/gmm-acc-stats2.cc +++ b/src/gmmbin/gmm-acc-stats2.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "gmm/mle-am-diag-gmm.h" #include "hmm/posterior.h" @@ -62,7 +62,7 @@ int main(int argc, char *argv[]) { AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary; Input ki(model_rxfilename, &binary); @@ -70,9 +70,6 @@ int main(int argc, char *argv[]) { am_gmm.Read(ki.Stream(), binary); } - Vector num_trans_accs, den_trans_accs; - trans_model.InitStats(&num_trans_accs); - trans_model.InitStats(&den_trans_accs); AccumAmDiagGmm num_gmm_accs, den_gmm_accs; num_gmm_accs.Init(am_gmm, StringToGmmFlags(update_flags_str)); den_gmm_accs.Init(am_gmm, StringToGmmFlags(update_flags_str)); @@ -110,11 +107,8 @@ int main(int argc, char *argv[]) { for (size_t i = 0; i < posterior.size(); i++) { for (size_t j = 0; j < posterior[i].size(); j++) { int32 tid = posterior[i][j].first, - pdf_id = trans_model.TransitionIdToPdf(tid); + pdf_id = trans_model.TransitionIdToPdfFast(tid); BaseFloat weight = posterior[i][j].second; - trans_model.Accumulate(fabs(weight), tid, - (weight > 0.0 ? - &num_trans_accs : &den_trans_accs)); tot_like_this_file += (weight > 0.0 ? 
&num_gmm_accs : &den_gmm_accs) -> AccumulateForGmm(am_gmm, mat.Row(i), pdf_id, fabs(weight)) * weight; @@ -136,12 +130,10 @@ int main(int argc, char *argv[]) { { Output ko(num_accs_wxfilename, binary); - num_trans_accs.Write(ko.Stream(), binary); num_gmm_accs.Write(ko.Stream(), binary); } { Output ko(den_accs_wxfilename, binary); - den_trans_accs.Write(ko.Stream(), binary); den_gmm_accs.Write(ko.Stream(), binary); } KALDI_LOG << "Written accs."; diff --git a/src/gmmbin/gmm-adapt-map.cc b/src/gmmbin/gmm-adapt-map.cc index ec3eb8cea9b..30fbc1e8d73 100644 --- a/src/gmmbin/gmm-adapt-map.cc +++ b/src/gmmbin/gmm-adapt-map.cc @@ -25,7 +25,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "gmm/mle-am-diag-gmm.h" #include "hmm/posterior.h" @@ -72,7 +72,7 @@ int main(int argc, char *argv[]) { MapAmDiagGmmWriter map_am_writer(map_am_wspecifier); AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary; Input is(model_filename, &binary); diff --git a/src/gmmbin/gmm-align-compiled.cc b/src/gmmbin/gmm-align-compiled.cc index 36349774773..f8b5a11d504 100644 --- a/src/gmmbin/gmm-align-compiled.cc +++ b/src/gmmbin/gmm-align-compiled.cc @@ -22,7 +22,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-utils.h" #include "fstext/fstext-lib.h" #include "decoder/decoder-wrappers.h" @@ -50,17 +50,11 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); AlignConfig align_config; BaseFloat acoustic_scale = 1.0; - BaseFloat transition_scale = 1.0; - BaseFloat self_loop_scale = 1.0; std::string per_frame_acwt_wspecifier; align_config.Register(&po); - po.Register("transition-scale", &transition_scale, - "Transition-probability scale [relative to acoustics]"); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); - po.Register("self-loop-scale", &self_loop_scale, - "Scale of self-loop versus non-self-loop log probs [relative to acoustics]"); po.Register("write-per-frame-acoustic-loglikes", &per_frame_acwt_wspecifier, "Wspecifier for table of vectors containing the acoustic log-likelihoods " "per frame for each utterance. E.g. ark:foo/per_frame_logprobs.1.ark"); @@ -77,7 +71,7 @@ int main(int argc, char *argv[]) { alignment_wspecifier = po.GetArg(4), scores_wspecifier = po.GetOptArg(5); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; @@ -114,13 +108,6 @@ int main(int argc, char *argv[]) { continue; } - { // Add transition-probs to the FST. - std::vector disambig_syms; // empty. 
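Aside: gmm-acc-stats2 routes each posterior entry by the sign of its weight: positive weights feed the numerator accumulator, negative weights the denominator, and only the magnitude fabs(weight) is accumulated. A standalone sketch of that routing logic; SimpleAccs is a toy stand-in for AccumAmDiagGmm.

#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

struct SimpleAccs { double tot_weight = 0.0; };

// Route one frame's posterior entries (pdf-id, signed weight) to the
// numerator or denominator accumulator, as gmm-acc-stats2 does.
void AccumulateFrame(const std::vector<std::pair<int, double> > &post,
                     SimpleAccs *num, SimpleAccs *den) {
  for (const auto &p : post) {
    double weight = p.second;
    SimpleAccs *accs = (weight > 0.0 ? num : den);
    accs->tot_weight += std::fabs(weight);  // magnitude only; the sign chose the accumulator.
  }
}

int main() {
  SimpleAccs num, den;
  AccumulateFrame({{10, 0.7}, {11, -0.3}, {12, 0.3}}, &num, &den);
  std::printf("num=%.1f den=%.1f\n", num.tot_weight, den.tot_weight);  // num=1.0 den=0.3
  return 0;
}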
- AddTransitionProbs(trans_model, disambig_syms, - transition_scale, self_loop_scale, - &decode_fst); - } - DecodableAmDiagGmmScaled gmm_decodable(am_gmm, trans_model, features, acoustic_scale); diff --git a/src/gmmbin/gmm-align.cc b/src/gmmbin/gmm-align.cc index c9c2fde11f6..e84a90cdb9a 100644 --- a/src/gmmbin/gmm-align.cc +++ b/src/gmmbin/gmm-align.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-utils.h" #include "decoder/decoder-wrappers.h" #include "decoder/training-graph-compiler.h" @@ -73,7 +73,7 @@ int main(int argc, char *argv[]) { ContextDependency ctx_dep; ReadKaldiObject(tree_in_filename, &ctx_dep); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; diff --git a/src/gmmbin/gmm-basis-fmllr-accs-gpost.cc b/src/gmmbin/gmm-basis-fmllr-accs-gpost.cc index f8f7b5d3433..9001b64ae82 100644 --- a/src/gmmbin/gmm-basis-fmllr-accs-gpost.cc +++ b/src/gmmbin/gmm-basis-fmllr-accs-gpost.cc @@ -26,7 +26,7 @@ using std::vector; #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/fmllr-diag-gmm.h" #include "transform/basis-fmllr-diag-gmm.h" #include "hmm/posterior.h" @@ -34,7 +34,7 @@ using std::vector; namespace kaldi { void AccumulateForUtterance(const Matrix &feats, const GaussPost &gpost, - const TransitionModel &trans_model, + const Transitions &trans_model, const AmDiagGmm &am_gmm, FmllrDiagGmmAccs *spk_stats) { for (size_t i = 0; i < gpost.size(); i++) { @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) { gpost_rspecifier = po.GetArg(3), accs_wspecifier = po.GetArg(4); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; diff --git a/src/gmmbin/gmm-basis-fmllr-accs.cc b/src/gmmbin/gmm-basis-fmllr-accs.cc index 58b365318f0..d78d652dfc5 100644 --- a/src/gmmbin/gmm-basis-fmllr-accs.cc +++ b/src/gmmbin/gmm-basis-fmllr-accs.cc @@ -26,7 +26,7 @@ using std::vector; #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/fmllr-diag-gmm.h" #include "transform/basis-fmllr-diag-gmm.h" #include "hmm/posterior.h" @@ -34,7 +34,7 @@ using std::vector; namespace kaldi { void AccumulateForUtterance(const Matrix &feats, const Posterior &post, - const TransitionModel &trans_model, + const Transitions &trans_model, const AmDiagGmm &am_gmm, FmllrDiagGmmAccs *spk_stats) { Posterior pdf_post; @@ -82,7 +82,7 @@ int main(int argc, char *argv[]) { post_rspecifier = po.GetArg(3), accs_wspecifier = po.GetArg(4); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; diff --git a/src/gmmbin/gmm-basis-fmllr-training.cc b/src/gmmbin/gmm-basis-fmllr-training.cc index 3d93c3ca877..d433f6903f6 100644 --- a/src/gmmbin/gmm-basis-fmllr-training.cc +++ b/src/gmmbin/gmm-basis-fmllr-training.cc @@ -25,7 +25,7 @@ using std::vector; #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/fmllr-diag-gmm.h" #include "transform/basis-fmllr-diag-gmm.h" @@ -53,7 +53,7 @@ int main(int argc, char *argv[]) { model_rxfilename = po.GetArg(1), basis_wspecifier = po.GetArg(2); - TransitionModel trans_model; + Transitions 
trans_model; AmDiagGmm am_gmm; { bool binary; diff --git a/src/gmmbin/gmm-boost-silence.cc b/src/gmmbin/gmm-boost-silence.cc index 7c9e4c82806..ef57f1190cb 100644 --- a/src/gmmbin/gmm-boost-silence.cc +++ b/src/gmmbin/gmm-boost-silence.cc @@ -19,7 +19,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "gmm/am-diag-gmm.h" int main(int argc, char *argv[]) { @@ -67,7 +67,7 @@ int main(int argc, char *argv[]) { } AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary_read; Input ki(model_rxfilename, &binary_read); diff --git a/src/gmmbin/gmm-compute-likes.cc b/src/gmmbin/gmm-compute-likes.cc index 78c813e1c3b..c7101f1a9ae 100644 --- a/src/gmmbin/gmm-compute-likes.cc +++ b/src/gmmbin/gmm-compute-likes.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "base/timer.h" @@ -55,7 +55,7 @@ int main(int argc, char *argv[]) { AmDiagGmm am_gmm; { bool binary; - TransitionModel trans_model; // not needed. + Transitions trans_model; // not needed. Input ki(model_in_filename, &binary); trans_model.Read(ki.Stream(), binary); am_gmm.Read(ki.Stream(), binary); diff --git a/src/gmmbin/gmm-copy.cc b/src/gmmbin/gmm-copy.cc index 0b33bc6d67f..bd42aeb2a25 100644 --- a/src/gmmbin/gmm-copy.cc +++ b/src/gmmbin/gmm-copy.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" int main(int argc, char *argv[]) { try { @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) { model_out_filename = po.GetArg(2); AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary_read; Input ki(model_in_filename, &binary_read); diff --git a/src/gmmbin/gmm-decode-biglm-faster.cc b/src/gmmbin/gmm-decode-biglm-faster.cc index 6e47d68de3c..9e7845e7849 100644 --- a/src/gmmbin/gmm-decode-biglm-faster.cc +++ b/src/gmmbin/gmm-decode-biglm-faster.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/biglm-faster-decoder.h" #include "gmm/decodable-am-diag-gmm.h" @@ -111,7 +111,7 @@ int main(int argc, char *argv[]) alignment_wspecifier = po.GetOptArg(7), lattice_wspecifier = po.GetOptArg(8); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; diff --git a/src/gmmbin/gmm-decode-faster-regtree-fmllr.cc b/src/gmmbin/gmm-decode-faster-regtree-fmllr.cc deleted file mode 100644 index ca39cbe8cb7..00000000000 --- a/src/gmmbin/gmm-decode-faster-regtree-fmllr.cc +++ /dev/null @@ -1,290 +0,0 @@ -// gmmbin/gmm-decode-faster-regtree-fmllr.cc - -// Copyright 2009-2012 Microsoft Corporation; Saarland University; -// Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-#include <vector>
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-#include "transform/regression-tree.h"
-#include "transform/regtree-fmllr-diag-gmm.h"
-#include "transform/fmllr-diag-gmm.h"
-#include "fstext/fstext-lib.h"
-#include "decoder/faster-decoder.h"
-#include "transform/decodable-am-diag-gmm-regtree.h"
-#include "base/timer.h"
-#include "lat/kaldi-lattice.h"  // for {Compact}LatticeArc
-
-using fst::SymbolTable;
-using fst::VectorFst;
-using fst::StdArc;
-using kaldi::BaseFloat;
-using std::string;
-using std::vector;
-using kaldi::LatticeWeight;
-using kaldi::LatticeArc;
-
-struct DecodeInfo {
- public:
-  DecodeInfo(const kaldi::AmDiagGmm &am,
-             const kaldi::TransitionModel &tm, kaldi::FasterDecoder *decoder,
-             BaseFloat scale, bool allow_partial,
-             const kaldi::Int32VectorWriter &wwriter,
-             const kaldi::Int32VectorWriter &awriter, fst::SymbolTable *wsyms)
-      : acoustic_model(am), trans_model(tm), decoder(decoder),
-        acoustic_scale(scale), allow_partial(allow_partial), words_writer(wwriter),
-        alignment_writer(awriter), word_syms(wsyms) {}
-
-  const kaldi::AmDiagGmm &acoustic_model;
-  const kaldi::TransitionModel &trans_model;
-  kaldi::FasterDecoder *decoder;
-  BaseFloat acoustic_scale;
-  bool allow_partial;
-  const kaldi::Int32VectorWriter &words_writer;
-  const kaldi::Int32VectorWriter &alignment_writer;
-  fst::SymbolTable *word_syms;
-
- private:
-  KALDI_DISALLOW_COPY_AND_ASSIGN(DecodeInfo);
-};
-
-bool DecodeUtterance(kaldi::FasterDecoder *decoder,
-                     kaldi::DecodableInterface *decodable,
-                     DecodeInfo *info,
-                     const string &uttid,
-                     int32 num_frames,
-                     BaseFloat *total_like) {
-  decoder->Decode(decodable);
-  KALDI_LOG << "Length of file is " << num_frames;
-
-  VectorFst<LatticeArc> decoded;  // linear FST.
-  if ( (info->allow_partial || decoder->ReachedFinal())
-       && decoder->GetBestPath(&decoded) ) {
-    if (!decoder->ReachedFinal())
-      KALDI_WARN << "Decoder did not reach end-state, outputting partial "
-          "traceback.";
-
-    vector<int32> alignment, words;
-    LatticeWeight weight;
-    GetLinearSymbolSequence(decoded, &alignment, &words, &weight);
-
-    info->words_writer.Write(uttid, words);
-    if (info->alignment_writer.IsOpen())
-      info->alignment_writer.Write(uttid, alignment);
-    if (info->word_syms != NULL) {
-      std::ostringstream ss;
-      ss << uttid << ' ';
-      for (size_t i = 0; i < words.size(); i++) {
-        string s = info->word_syms->Find(words[i]);
-        if (s == "")
-          KALDI_ERR << "Word-id " << words[i] << " not in symbol table.";
-        ss << s << ' ';
-      }
-      ss << '\n';
-      KALDI_LOG << ss.str();
-    }
-
-    BaseFloat like = -weight.Value1() -weight.Value2();
-    KALDI_LOG << "Log-like per frame = " << (like/num_frames);
-    (*total_like) += like;
-    return true;
-  } else {
-    KALDI_WARN << "Did not successfully decode utterance, length = "
-               << num_frames;
-    return false;
-  }
-}
-
-int main(int argc, char *argv[]) {
-  try {
-    using namespace kaldi;
-    typedef kaldi::int32 int32;
-
-    const char *usage = "Decode features using GMM-based model.\n"
-        "Usage: gmm-decode-faster-regtree-fmllr [options] model-in fst-in "
-        "regtree-in features-rspecifier transforms-rspecifier "
-        "words-wspecifier [alignments-wspecifier]\n";
-    ParseOptions po(usage);
-    bool binary = true;
-    bool allow_partial = true;
-    BaseFloat acoustic_scale = 0.1;
-
-    std::string word_syms_filename, utt2spk_rspecifier;
-    FasterDecoderOptions decoder_opts;
-    decoder_opts.Register(&po, true);  // true == include obscure settings.
-    po.Register("utt2spk", &utt2spk_rspecifier, "rspecifier for utterance to "
-                "speaker map");
-    po.Register("binary", &binary, "Write output in binary mode");
-    po.Register("acoustic-scale", &acoustic_scale,
-                "Scaling factor for acoustic likelihoods");
-    po.Register("word-symbol-table", &word_syms_filename,
-                "Symbol table for words [for debug output]");
-    po.Register("allow-partial", &allow_partial,
-                "Produce output even when final state was not reached");
-    po.Read(argc, argv);
-
-    if (po.NumArgs() < 6 || po.NumArgs() > 7) {
-      po.PrintUsage();
-      exit(1);
-    }
-
-    std::string model_in_filename = po.GetArg(1),
-        fst_in_filename = po.GetArg(2),
-        regtree_filename = po.GetArg(3),
-        feature_rspecifier = po.GetArg(4),
-        xforms_rspecifier = po.GetArg(5),
-        words_wspecifier = po.GetArg(6),
-        alignment_wspecifier = po.GetOptArg(7);
-
-    TransitionModel trans_model;
-    AmDiagGmm am_gmm;
-    {
-      bool binary_read;
-      Input ki(model_in_filename, &binary_read);
-      trans_model.Read(ki.Stream(), binary_read);
-      am_gmm.Read(ki.Stream(), binary_read);
-    }
-
-    VectorFst<StdArc> *decode_fst = fst::ReadFstKaldi(fst_in_filename);
-
-    RegressionTree regtree;
-    {
-      bool binary_read;
-      Input in(regtree_filename, &binary_read);
-      regtree.Read(in.Stream(), binary_read, am_gmm);
-    }
-
-    RandomAccessRegtreeFmllrDiagGmmReaderMapped fmllr_reader(xforms_rspecifier,
-                                                             utt2spk_rspecifier);
-
-    Int32VectorWriter words_writer(words_wspecifier);
-
-    Int32VectorWriter alignment_writer(alignment_wspecifier);
-
-    fst::SymbolTable *word_syms = NULL;
-    if (word_syms_filename != "") {
-      word_syms = fst::SymbolTable::ReadText(word_syms_filename);
-      if (!word_syms) {
-        KALDI_ERR << "Could not read symbol table from file "
-                  << word_syms_filename;
-      }
-    }
-
-    BaseFloat tot_like = 0.0;
-    kaldi::int64 frame_count = 0;
-    int num_success = 0, num_fail = 0;
-    FasterDecoder decoder(*decode_fst,
-                          decoder_opts);
-
-    Timer timer;
-
-    DecodeInfo decode_info(am_gmm, trans_model, &decoder, acoustic_scale,
-                           allow_partial, words_writer, alignment_writer,
-                           word_syms);
-
-    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-    for (; !feature_reader.Done(); feature_reader.Next()) {
-      string utt = feature_reader.Key();
-
-      Matrix<BaseFloat> features(feature_reader.Value());
-      feature_reader.FreeCurrent();
-      if (features.NumRows() == 0) {
-        KALDI_WARN << "Zero-length utterance: " << utt;
-        num_fail++;
-        continue;
-      }
-
-      if (!fmllr_reader.HasKey(utt)) {  // Decode without FMLLR if none found
-        KALDI_WARN << "No FMLLR transform for key " << utt <<
-            ", decoding without fMLLR.";
-        kaldi::DecodableAmDiagGmmScaled gmm_decodable(am_gmm, trans_model,
-                                                      features,
-                                                      acoustic_scale);
-        if (DecodeUtterance(&decoder, &gmm_decodable, &decode_info,
-                            utt, features.NumRows(), &tot_like)) {
-          frame_count += gmm_decodable.NumFramesReady();
-          num_success++;
-        } else {
-          num_fail++;
-        }
-        continue;
-      }
-
-      // If found, load the transforms for the current utterance.
-      RegtreeFmllrDiagGmm fmllr(fmllr_reader.Value(utt));
-      if (fmllr.NumRegClasses() == 1) {
-        Matrix<BaseFloat> xformed_features(features);
-        Matrix<BaseFloat> fmllr_matrix;
-        fmllr.GetXformMatrix(0, &fmllr_matrix);
-        for (int32 i = 0; i < xformed_features.NumRows(); i++) {
-          SubVector<BaseFloat> row(xformed_features, i);
-          ApplyAffineTransform(fmllr_matrix, &row);
-        }
-        kaldi::DecodableAmDiagGmmScaled gmm_decodable(am_gmm, trans_model,
-                                                      xformed_features,
-                                                      acoustic_scale);
-
-        if (DecodeUtterance(&decoder, &gmm_decodable, &decode_info,
-                            utt, xformed_features.NumRows(), &tot_like)) {
-          frame_count += gmm_decodable.NumFramesReady();
-          num_success++;
-        } else {
-          num_fail++;
-        }
-      } else {
-        kaldi::DecodableAmDiagGmmRegtreeFmllr gmm_decodable(am_gmm, trans_model,
-                                                            features, fmllr,
-                                                            regtree,
-                                                            acoustic_scale);
-        if (DecodeUtterance(&decoder, &gmm_decodable, &decode_info,
-                            utt, features.NumRows(), &tot_like)) {
-          frame_count += gmm_decodable.NumFramesReady();
-          num_success++;
-        } else {
-          num_fail++;
-        }
-      }
-    }  // end looping over all utterances
-
-    KALDI_LOG << "Average log-likelihood per frame is " << (tot_like
-        / frame_count) << " over " << frame_count << " frames.";
-
-    double elapsed = timer.Elapsed();
-    KALDI_LOG << "Time taken [excluding initialization] " << elapsed
-              << "s: real-time factor assuming 100 frames/sec is "
-              << (elapsed * 100.0 / frame_count);
-    KALDI_LOG << "Done " << num_success << " utterances, failed for "
-              << num_fail;
-
-    delete word_syms;
-    delete decode_fst;
-    if (num_success != 0)
-      return 0;
-    else
-      return 1;
-  }
-  catch(const std::exception &e) {
-    std::cerr << e.what();
-    return -1;
-  }
-}
-
-
diff --git a/src/gmmbin/gmm-decode-faster-regtree-mllr.cc b/src/gmmbin/gmm-decode-faster-regtree-mllr.cc
deleted file mode 100644
index 9a5d9486b9f..00000000000
--- a/src/gmmbin/gmm-decode-faster-regtree-mllr.cc
+++ /dev/null
@@ -1,267 +0,0 @@
-// gmmbin/gmm-decode-faster-regtree-mllr.cc
-
-// Copyright 2009-2013 Microsoft Corporation;  Saarland University;
-//                     Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" -#include "transform/regression-tree.h" -#include "transform/regtree-mllr-diag-gmm.h" -#include "fstext/fstext-lib.h" -#include "decoder/faster-decoder.h" -#include "transform/decodable-am-diag-gmm-regtree.h" -#include "base/timer.h" -#include "lat/kaldi-lattice.h" // for {Compact}LatticeArc - -using fst::SymbolTable; -using fst::VectorFst; -using fst::StdArc; -using kaldi::BaseFloat; -using std::string; -using std::vector; -using kaldi::LatticeWeight; -using kaldi::LatticeArc; - -struct DecodeInfo { - public: - DecodeInfo(const kaldi::AmDiagGmm &am, - const kaldi::TransitionModel &tm, kaldi::FasterDecoder *decoder, - BaseFloat scale, bool allow_partial, - const kaldi::Int32VectorWriter &wwriter, - const kaldi::Int32VectorWriter &awriter, fst::SymbolTable *wsyms) - : acoustic_model(am), trans_model(tm), decoder(decoder), - acoustic_scale(scale), allow_partial(allow_partial), words_writer(wwriter), - alignment_writer(awriter), word_syms(wsyms) {} - - const kaldi::AmDiagGmm &acoustic_model; - const kaldi::TransitionModel &trans_model; - kaldi::FasterDecoder *decoder; - BaseFloat acoustic_scale; - bool allow_partial; - const kaldi::Int32VectorWriter &words_writer; - const kaldi::Int32VectorWriter &alignment_writer; - fst::SymbolTable *word_syms; - - private: - KALDI_DISALLOW_COPY_AND_ASSIGN(DecodeInfo); -}; - -bool DecodeUtterance(kaldi::FasterDecoder *decoder, - kaldi::DecodableInterface *decodable, - DecodeInfo *info, - const string &uttid, - int32 num_frames, - BaseFloat *total_like) { - decoder->Decode(decodable); - KALDI_LOG << "Length of file is " << num_frames;; - - VectorFst decoded; // linear FST. 
- if ( (info->allow_partial || decoder->ReachedFinal()) - && decoder->GetBestPath(&decoded) ) { - if (!decoder->ReachedFinal()) - KALDI_WARN << "Decoder did not reach end-state, outputting partial " - "traceback."; - - vector alignment, words; - LatticeWeight weight; - GetLinearSymbolSequence(decoded, &alignment, &words, &weight); - - info->words_writer.Write(uttid, words); - if (info->alignment_writer.IsOpen()) - info->alignment_writer.Write(uttid, alignment); - if (info->word_syms != NULL) { - std::ostringstream ss; - ss << uttid << ' '; - for (size_t i = 0; i < words.size(); i++) { - string s = info->word_syms->Find(words[i]); - if (s == "") - KALDI_ERR << "Word-id " << words[i] << " not in symbol table."; - ss << s << ' '; - } - ss << '\n'; - KALDI_LOG << ss.str(); - } - - BaseFloat like = -weight.Value1() -weight.Value2(); - KALDI_LOG << "Log-like per frame = " << (like/num_frames); - (*total_like) += like; - return true; - } else { - KALDI_WARN << "Did not successfully decode utterance, length = " - << num_frames; - return false; - } -} - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - typedef kaldi::int32 int32; - - const char *usage = "Decode features using GMM-based model.\n" - "Usage: gmm-decode-faster-regtree-mllr [options] model-in fst-in " - "regtree-in features-rspecifier transforms-rspecifier " - "words-wspecifier [alignments-wspecifier]\n"; - ParseOptions po(usage); - bool binary = true; - bool allow_partial = true; - BaseFloat acoustic_scale = 0.1; - - std::string word_syms_filename, utt2spk_rspecifier; - FasterDecoderOptions decoder_opts; - decoder_opts.Register(&po, true); // true == include obscure settings. - po.Register("utt2spk", &utt2spk_rspecifier, "rspecifier for utterance to " - "speaker map"); - po.Register("binary", &binary, "Write output in binary mode"); - po.Register("acoustic-scale", &acoustic_scale, - "Scaling factor for acoustic likelihoods"); - po.Register("word-symbol-table", &word_syms_filename, - "Symbol table for words [for debug output]"); - po.Register("allow-partial", &allow_partial, - "Produce output even when final state was not reached"); - po.Read(argc, argv); - - if (po.NumArgs() < 6 || po.NumArgs() > 7) { - po.PrintUsage(); - exit(1); - } - - std::string model_in_filename = po.GetArg(1), - fst_in_filename = po.GetArg(2), - regtree_filename = po.GetArg(3), - feature_rspecifier = po.GetArg(4), - xforms_rspecifier = po.GetArg(5), - words_wspecifier = po.GetArg(6), - alignment_wspecifier = po.GetOptArg(7); - - TransitionModel trans_model; - AmDiagGmm am_gmm; - { - bool binary_read; - Input ki(model_in_filename, &binary_read); - trans_model.Read(ki.Stream(), binary_read); - am_gmm.Read(ki.Stream(), binary_read); - } - - VectorFst *decode_fst = fst::ReadFstKaldi(fst_in_filename); - - RegressionTree regtree; - { - bool binary_read; - Input in(regtree_filename, &binary_read); - regtree.Read(in.Stream(), binary_read, am_gmm); - } - - RandomAccessRegtreeMllrDiagGmmReaderMapped mllr_reader(xforms_rspecifier, - utt2spk_rspecifier); - - Int32VectorWriter words_writer(words_wspecifier); - - Int32VectorWriter alignment_writer(alignment_wspecifier); - - fst::SymbolTable *word_syms = NULL; - if (word_syms_filename != "") { - word_syms = fst::SymbolTable::ReadText(word_syms_filename); - if (!word_syms) { - KALDI_ERR << "Could not read symbol table from file " - << word_syms_filename; - } - } - - BaseFloat tot_like = 0.0; - kaldi::int64 frame_count = 0; - int num_success = 0, num_fail = 0; - FasterDecoder decoder(*decode_fst, 
decoder_opts); - - Timer timer; - - DecodeInfo decode_info(am_gmm, trans_model, &decoder, acoustic_scale, - allow_partial, words_writer, alignment_writer, - word_syms); - - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - for (; !feature_reader.Done(); feature_reader.Next()) { - string utt = feature_reader.Key(); - - Matrix features(feature_reader.Value()); - feature_reader.FreeCurrent(); - if (features.NumRows() == 0) { - KALDI_WARN << "Zero-length utterance: " << utt; - num_fail++; - continue; - } - - if (!mllr_reader.HasKey(utt)) { // Decode without MLLR if none found - KALDI_WARN << "No MLLR transform for key " << utt << - ", decoding without MLLR."; - kaldi::DecodableAmDiagGmmScaled gmm_decodable(am_gmm, trans_model, - features, - acoustic_scale); - if (DecodeUtterance(&decoder, &gmm_decodable, &decode_info, - utt, features.NumRows(), &tot_like)) { - frame_count += gmm_decodable.NumFramesReady(); - num_success++; - } else { - num_fail++; - } - continue; - } - - // If found, load the transforms for the current utterance. - const RegtreeMllrDiagGmm &mllr = mllr_reader.Value(utt); - kaldi::DecodableAmDiagGmmRegtreeMllr gmm_decodable(am_gmm, trans_model, - features, mllr, - regtree, - acoustic_scale); - if (DecodeUtterance(&decoder, &gmm_decodable, &decode_info, - utt, features.NumRows(), &tot_like)) { - frame_count += gmm_decodable.NumFramesReady(); - num_success++; - } else { - num_fail++; - } - } // end looping over all utterances - - double elapsed = timer.Elapsed(); - KALDI_LOG << "Time taken [excluding initialization] " << elapsed - << "s: real-time factor assuming 100 frames/sec is " - << (elapsed * 100.0 / frame_count); - KALDI_LOG << "Done " << num_success << " utterances, failed for " - << num_fail; - KALDI_LOG << "Overall log-likelihood per frame is " - << (tot_like / frame_count) << " over " << frame_count - << " frames."; - - delete decode_fst; - if (num_success != 0) - return 0; - else - return 1; - } - catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/gmmbin/gmm-decode-faster.cc b/src/gmmbin/gmm-decode-faster.cc index 34c4ff2c37e..438e3d9c9d1 100644 --- a/src/gmmbin/gmm-decode-faster.cc +++ b/src/gmmbin/gmm-decode-faster.cc @@ -22,7 +22,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/faster-decoder.h" #include "gmm/decodable-am-diag-gmm.h" @@ -101,7 +101,7 @@ int main(int argc, char *argv[]) { alignment_wspecifier = po.GetOptArg(5), lattice_wspecifier = po.GetOptArg(6); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; diff --git a/src/gmmbin/gmm-decode-simple.cc b/src/gmmbin/gmm-decode-simple.cc index 5ef35552dc0..ef87585cc1e 100644 --- a/src/gmmbin/gmm-decode-simple.cc +++ b/src/gmmbin/gmm-decode-simple.cc @@ -23,7 +23,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/simple-decoder.h" #include "gmm/decodable-am-diag-gmm.h" @@ -78,7 +78,7 @@ int main(int argc, char *argv[]) { alignment_wspecifier = po.GetOptArg(5), lattice_wspecifier = po.GetOptArg(6); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; diff --git a/src/gmmbin/gmm-est-basis-fmllr-gpost.cc b/src/gmmbin/gmm-est-basis-fmllr-gpost.cc index 
54b92d8aa61..3d864c88086 100644 --- a/src/gmmbin/gmm-est-basis-fmllr-gpost.cc +++ b/src/gmmbin/gmm-est-basis-fmllr-gpost.cc @@ -26,7 +26,7 @@ using std::vector; #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/fmllr-diag-gmm.h" #include "transform/basis-fmllr-diag-gmm.h" #include "hmm/posterior.h" @@ -34,7 +34,7 @@ using std::vector; namespace kaldi { void AccumulateForUtterance(const Matrix &feats, const GaussPost &gpost, - const TransitionModel &trans_model, + const Transitions &trans_model, const AmDiagGmm &am_gmm, FmllrDiagGmmAccs *spk_stats) { for (size_t i = 0; i < gpost.size(); i++) { @@ -87,7 +87,7 @@ int main(int argc, char *argv[]) { gpost_rspecifier = po.GetArg(4), trans_wspecifier = po.GetArg(5); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; diff --git a/src/gmmbin/gmm-est-basis-fmllr.cc b/src/gmmbin/gmm-est-basis-fmllr.cc index 0d163169ce2..fe64a1b2166 100644 --- a/src/gmmbin/gmm-est-basis-fmllr.cc +++ b/src/gmmbin/gmm-est-basis-fmllr.cc @@ -26,7 +26,7 @@ using std::vector; #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/fmllr-diag-gmm.h" #include "transform/basis-fmllr-diag-gmm.h" #include "hmm/posterior.h" @@ -34,7 +34,7 @@ using std::vector; namespace kaldi { void AccumulateForUtterance(const Matrix &feats, const Posterior &post, - const TransitionModel &trans_model, + const Transitions &trans_model, const AmDiagGmm &am_gmm, FmllrDiagGmmAccs *spk_stats) { Posterior pdf_post; @@ -89,7 +89,7 @@ int main(int argc, char *argv[]) { post_rspecifier = po.GetArg(4), trans_wspecifier = po.GetArg(5); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; diff --git a/src/gmmbin/gmm-est-fmllr-global.cc b/src/gmmbin/gmm-est-fmllr-global.cc index b3af0780aa5..d167ba25890 100644 --- a/src/gmmbin/gmm-est-fmllr-global.cc +++ b/src/gmmbin/gmm-est-fmllr-global.cc @@ -27,7 +27,7 @@ using std::vector; #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/fmllr-diag-gmm.h" #include "hmm/posterior.h" diff --git a/src/gmmbin/gmm-est-fmllr-gpost.cc b/src/gmmbin/gmm-est-fmllr-gpost.cc index d1cae0d7f48..9d830737718 100644 --- a/src/gmmbin/gmm-est-fmllr-gpost.cc +++ b/src/gmmbin/gmm-est-fmllr-gpost.cc @@ -27,14 +27,14 @@ using std::vector; #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/fmllr-diag-gmm.h" #include "hmm/posterior.h" namespace kaldi { void AccumulateForUtterance(const Matrix &feats, const GaussPost &gpost, - const TransitionModel &trans_model, + const Transitions &trans_model, const AmDiagGmm &am_gmm, FmllrDiagGmmAccs *spk_stats) { for (size_t i = 0; i < gpost.size(); i++) { @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) { gpost_rspecifier = po.GetArg(3), trans_wspecifier = po.GetArg(4); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; diff --git a/src/gmmbin/gmm-est-fmllr-raw-gpost.cc b/src/gmmbin/gmm-est-fmllr-raw-gpost.cc deleted file mode 100644 index 1f5a09f233b..00000000000 --- a/src/gmmbin/gmm-est-fmllr-raw-gpost.cc +++ /dev/null @@ -1,198 +0,0 @@ -// 
gmmbin/gmm-est-fmllr-raw-gpost.cc - -// Copyright 2013 Johns Hopkins University (author: Daniel Povey) -// 2014 Guoguo Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "transform/fmllr-raw.h" -#include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" -#include "util/common-utils.h" -#include "hmm/posterior.h" - -namespace kaldi { - - -void AccStatsForUtterance(const TransitionModel &trans_model, - const AmDiagGmm &am_gmm, - const GaussPost &gpost, - const Matrix &feats, - FmllrRawAccs *accs) { - for (size_t t = 0; t < gpost.size(); t++) { - for (size_t i = 0; i < gpost[t].size(); i++) { - int32 pdf = gpost[t][i].first; - const Vector &posterior(gpost[t][i].second); - accs->AccumulateFromPosteriors(am_gmm.GetPdf(pdf), - feats.Row(t), posterior); - } - } -} - - -} - -int main(int argc, char *argv[]) { - try { - typedef kaldi::int32 int32; - using namespace kaldi; - const char *usage = - "Estimate fMLLR transforms in the space before splicing and linear transforms\n" - "such as LDA+MLLT, but using models in the space transformed by these transforms\n" - "Requires the original spliced features, and the full LDA+MLLT (or similar) matrix\n" - "including the 'rejected' rows (see the program get-full-lda-mat). 
Reads in\n" - "Gaussian-level posteriors.\n" - "Usage: gmm-est-fmllr-raw-gpost [options] " - " \n"; - - - int32 raw_feat_dim = 13; - ParseOptions po(usage); - FmllrRawOptions opts; - std::string spk2utt_rspecifier; - po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to " - "utterance-list map"); - po.Register("raw-feat-dim", &raw_feat_dim, "Dimension of raw features " - "prior to splicing"); - opts.Register(&po); - - po.Read(argc, argv); - - if (po.NumArgs() != 5) { - po.PrintUsage(); - exit(1); - } - - std::string model_rxfilename = po.GetArg(1), - full_lda_mat_rxfilename = po.GetArg(2), - feature_rspecifier = po.GetArg(3), - gpost_rspecifier = po.GetArg(4), - transform_wspecifier = po.GetArg(5); - - AmDiagGmm am_gmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_rxfilename, &binary); - trans_model.Read(ki.Stream(), binary); - am_gmm.Read(ki.Stream(), binary); - } - - Matrix full_lda_mat; - ReadKaldiObject(full_lda_mat_rxfilename, &full_lda_mat); - - RandomAccessGaussPostReader gpost_reader(gpost_rspecifier); - BaseFloatMatrixWriter transform_writer(transform_wspecifier); - - double tot_auxf_impr = 0.0, tot_count = 0.0; - - int32 num_done = 0, num_err = 0; - if (!spk2utt_rspecifier.empty()) { // Adapting per speaker - SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); - RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); - - for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { - FmllrRawAccs accs(raw_feat_dim, am_gmm.Dim(), full_lda_mat); - std::string spk = spk2utt_reader.Key(); - const std::vector &uttlist = spk2utt_reader.Value(); - for (size_t i = 0; i < uttlist.size(); i++) { - std::string utt = uttlist[i]; - if (!feature_reader.HasKey(utt)) { - KALDI_WARN << "Features not found for utterance " << utt; - num_err++; - continue; - } - if (!gpost_reader.HasKey(utt)) { - KALDI_WARN << "Gaussian-level posteriors not found for utterance " << utt; - num_err++; - continue; - } - const Matrix &feats = feature_reader.Value(utt); - const GaussPost &gpost = gpost_reader.Value(utt); - if (static_cast(gpost.size()) != feats.NumRows()) { - KALDI_WARN << "Size mismatch between gposteriors " << gpost.size() - << " and features " << feats.NumRows(); - num_err++; - continue; - } - - AccStatsForUtterance(trans_model, am_gmm, gpost, feats, &accs); - num_done++; - } - - BaseFloat auxf_impr, count; - { - Matrix transform(raw_feat_dim, raw_feat_dim + 1); - transform.SetUnit(); - accs.Update(opts, &transform, &auxf_impr, &count); - transform_writer.Write(spk, transform); - } - KALDI_LOG << "For speaker " << spk << ", auxf-impr from raw fMLLR is " - << (auxf_impr/count) << " over " << count << " frames."; - tot_auxf_impr += auxf_impr; - tot_count += count; - } - } else { // --spk2utt option not given -> adapt per utterance. 
- SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - for (; !feature_reader.Done(); feature_reader.Next()) { - std::string utt = feature_reader.Key(); - if (!gpost_reader.HasKey(utt)) { - KALDI_WARN << "Gaussian-level posteriors not found for utterance " << utt; - num_err++; - continue; - } - const Matrix &feats = feature_reader.Value(); - const GaussPost &gpost = gpost_reader.Value(utt); - - if (static_cast(gpost.size()) != feats.NumRows()) { - KALDI_WARN << "Size mismatch between posteriors " << gpost.size() - << " and features " << feats.NumRows(); - num_err++; - continue; - } - - FmllrRawAccs accs(raw_feat_dim, am_gmm.Dim(), full_lda_mat); - - AccStatsForUtterance(trans_model, am_gmm, gpost, feats, &accs); - - BaseFloat auxf_impr, count; - { - Matrix transform(raw_feat_dim, raw_feat_dim + 1); - transform.SetUnit(); - accs.Update(opts, &transform, &auxf_impr, &count); - transform_writer.Write(utt, transform); - } - KALDI_LOG << "For utterance " << utt << ", auxf-impr from raw fMLLR is " - << (auxf_impr/count) << " over " << count << " frames."; - tot_auxf_impr += auxf_impr; - tot_count += count; - num_done++; - } - } - - KALDI_LOG << "Processed " << num_done << " utterances, " - << num_err << " had errors."; - KALDI_LOG << "Overall raw-fMLLR auxf impr per frame is " - << (tot_auxf_impr / tot_count) << " over " << tot_count - << " frames."; - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - diff --git a/src/gmmbin/gmm-est-fmllr-raw.cc b/src/gmmbin/gmm-est-fmllr-raw.cc deleted file mode 100644 index 5e83bfb1fb3..00000000000 --- a/src/gmmbin/gmm-est-fmllr-raw.cc +++ /dev/null @@ -1,199 +0,0 @@ -// gmmbin/gmm-est-fmllr-raw.cc - -// Copyright 2013 Johns Hopkins University (author: Daniel Povey) -// 2014 Guoguo Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include "base/kaldi-common.h" -#include "transform/fmllr-raw.h" -#include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" -#include "util/common-utils.h" -#include "hmm/posterior.h" - -namespace kaldi { - - -void AccStatsForUtterance(const TransitionModel &trans_model, - const AmDiagGmm &am_gmm, - const Posterior &post, - const Matrix &feats, - FmllrRawAccs *accs) { - Posterior pdf_post; - ConvertPosteriorToPdfs(trans_model, post, &pdf_post); - for (size_t t = 0; t < post.size(); t++) { - for (size_t i = 0; i < pdf_post[t].size(); i++) { - int32 pdf = pdf_post[t][i].first; - BaseFloat weight = pdf_post[t][i].second; - accs->AccumulateForGmm(am_gmm.GetPdf(pdf), - feats.Row(t), weight); - } - } -} - - -} - -int main(int argc, char *argv[]) { - try { - typedef kaldi::int32 int32; - using namespace kaldi; - const char *usage = - "Estimate fMLLR transforms in the space before splicing and linear transforms\n" - "such as LDA+MLLT, but using models in the space transformed by these transforms\n" - "Requires the original spliced features, and the full LDA+MLLT (or similar) matrix\n" - "including the 'rejected' rows (see the program get-full-lda-mat)\n" - "Usage: gmm-est-fmllr-raw [options] " - " \n"; - - - int32 raw_feat_dim = 13; - ParseOptions po(usage); - FmllrRawOptions opts; - std::string spk2utt_rspecifier; - po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to " - "utterance-list map"); - po.Register("raw-feat-dim", &raw_feat_dim, "Dimension of raw features " - "prior to splicing"); - opts.Register(&po); - - po.Read(argc, argv); - - if (po.NumArgs() != 5) { - po.PrintUsage(); - exit(1); - } - - std::string model_rxfilename = po.GetArg(1), - full_lda_mat_rxfilename = po.GetArg(2), - feature_rspecifier = po.GetArg(3), - post_rspecifier = po.GetArg(4), - transform_wspecifier = po.GetArg(5); - - AmDiagGmm am_gmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_rxfilename, &binary); - trans_model.Read(ki.Stream(), binary); - am_gmm.Read(ki.Stream(), binary); - } - - Matrix full_lda_mat; - ReadKaldiObject(full_lda_mat_rxfilename, &full_lda_mat); - - RandomAccessPosteriorReader post_reader(post_rspecifier); - BaseFloatMatrixWriter transform_writer(transform_wspecifier); - - double tot_auxf_impr = 0.0, tot_count = 0.0; - - int32 num_done = 0, num_err = 0; - if (!spk2utt_rspecifier.empty()) { // Adapting per speaker - SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); - RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); - - for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { - FmllrRawAccs accs(raw_feat_dim, am_gmm.Dim(), full_lda_mat); - std::string spk = spk2utt_reader.Key(); - const std::vector &uttlist = spk2utt_reader.Value(); - for (size_t i = 0; i < uttlist.size(); i++) { - std::string utt = uttlist[i]; - if (!feature_reader.HasKey(utt)) { - KALDI_WARN << "Features not found for utterance " << utt; - num_err++; - continue; - } - if (!post_reader.HasKey(utt)) { - KALDI_WARN << "Posteriors not found for utterance " << utt; - num_err++; - continue; - } - const Matrix &feats = feature_reader.Value(utt); - const Posterior &post = post_reader.Value(utt); - if (static_cast(post.size()) != feats.NumRows()) { - KALDI_WARN << "Size mismatch between posteriors " << post.size() - << " and features " << feats.NumRows(); - num_err++; - continue; - } - - AccStatsForUtterance(trans_model, am_gmm, post, feats, &accs); - num_done++; - } - - BaseFloat auxf_impr, count; - { - Matrix transform(raw_feat_dim, raw_feat_dim + 1); - 
transform.SetUnit(); - accs.Update(opts, &transform, &auxf_impr, &count); - transform_writer.Write(spk, transform); - } - KALDI_LOG << "For speaker " << spk << ", auxf-impr from raw fMLLR is " - << (auxf_impr/count) << " over " << count << " frames."; - tot_auxf_impr += auxf_impr; - tot_count += count; - } - } else { // --spk2utt option not given -> adapt per utterance. - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - for (; !feature_reader.Done(); feature_reader.Next()) { - std::string utt = feature_reader.Key(); - if (!post_reader.HasKey(utt)) { - KALDI_WARN << "Posteriors not found for utterance " << utt; - num_err++; - continue; - } - const Matrix &feats = feature_reader.Value(); - const Posterior &post = post_reader.Value(utt); - - if (static_cast(post.size()) != feats.NumRows()) { - KALDI_WARN << "Size mismatch between posteriors " << post.size() - << " and features " << feats.NumRows(); - num_err++; - continue; - } - - FmllrRawAccs accs(raw_feat_dim, am_gmm.Dim(), full_lda_mat); - - AccStatsForUtterance(trans_model, am_gmm, post, feats, &accs); - - BaseFloat auxf_impr, count; - { - Matrix transform(raw_feat_dim, raw_feat_dim + 1); - transform.SetUnit(); - accs.Update(opts, &transform, &auxf_impr, &count); - transform_writer.Write(utt, transform); - } - KALDI_LOG << "For utterance " << utt << ", auxf-impr from raw fMLLR is " - << (auxf_impr/count) << " over " << count << " frames."; - tot_auxf_impr += auxf_impr; - tot_count += count; - num_done++; - } - } - - KALDI_LOG << "Processed " << num_done << " utterances, " - << num_err << " had errors."; - KALDI_LOG << "Overall raw-fMLLR auxf impr per frame is " - << (tot_auxf_impr / tot_count) << " over " << tot_count - << " frames."; - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - diff --git a/src/gmmbin/gmm-est-fmllr.cc b/src/gmmbin/gmm-est-fmllr.cc index 9f8dfd89143..c44a284b2f8 100644 --- a/src/gmmbin/gmm-est-fmllr.cc +++ b/src/gmmbin/gmm-est-fmllr.cc @@ -27,14 +27,14 @@ using std::vector; #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/fmllr-diag-gmm.h" #include "hmm/posterior.h" namespace kaldi { void AccumulateForUtterance(const Matrix &feats, const Posterior &post, - const TransitionModel &trans_model, + const Transitions &trans_model, const AmDiagGmm &am_gmm, FmllrDiagGmmAccs *spk_stats) { Posterior pdf_post; @@ -83,7 +83,7 @@ int main(int argc, char *argv[]) { post_rspecifier = po.GetArg(3), trans_wspecifier = po.GetArg(4); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; diff --git a/src/gmmbin/gmm-est-gaussians-ebw.cc b/src/gmmbin/gmm-est-gaussians-ebw.cc index bbd53c2bec0..cfbb8ece02d 100644 --- a/src/gmmbin/gmm-est-gaussians-ebw.cc +++ b/src/gmmbin/gmm-est-gaussians-ebw.cc @@ -21,7 +21,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "gmm/ebw-diag-gmm.h" int main(int argc, char *argv[]) { @@ -62,7 +62,7 @@ int main(int argc, char *argv[]) { model_out_filename = po.GetArg(4); AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary_read; Input ki(model_in_filename, &binary_read); diff --git a/src/gmmbin/gmm-est-lvtln-trans.cc b/src/gmmbin/gmm-est-lvtln-trans.cc index abfc24a6585..849560dd437 100644 --- 
a/src/gmmbin/gmm-est-lvtln-trans.cc +++ b/src/gmmbin/gmm-est-lvtln-trans.cc @@ -26,7 +26,7 @@ using std::vector; #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/lvtln.h" #include "hmm/posterior.h" @@ -86,7 +86,7 @@ int main(int argc, char *argv[]) { { bool binary; Input ki(model_rxfilename, &binary); - TransitionModel trans_model; + Transitions trans_model; trans_model.Read(ki.Stream(), binary); am_gmm.Read(ki.Stream(), binary); } diff --git a/src/gmmbin/gmm-est-map.cc b/src/gmmbin/gmm-est-map.cc index 22ea8acda51..6cbb864fcf7 100644 --- a/src/gmmbin/gmm-est-map.cc +++ b/src/gmmbin/gmm-est-map.cc @@ -22,7 +22,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "gmm/mle-am-diag-gmm.h" int main(int argc, char *argv[]) { @@ -36,7 +36,6 @@ int main(int argc, char *argv[]) { "e.g.: gmm-est-map 1.mdl 1.acc 2.mdl\n"; bool binary_write = true; - MapTransitionUpdateConfig tcfg; MapDiagGmmOptions gmm_opts; std::string update_flags_str = "mvwt"; std::string occs_out_filename; @@ -47,7 +46,6 @@ int main(int argc, char *argv[]) { "update: subset of mvwt."); po.Register("write-occs", &occs_out_filename, "File to write state " "occupancies to."); - tcfg.Register(&po); gmm_opts.Register(&po); po.Read(argc, argv); @@ -65,7 +63,7 @@ int main(int argc, char *argv[]) { model_out_filename = po.GetArg(3); AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary_read; Input ki(model_in_filename, &binary_read); @@ -82,14 +80,6 @@ int main(int argc, char *argv[]) { gmm_accs.Read(ki.Stream(), binary, true); // true == add; doesn't matter here. } - if (update_flags & kGmmTransitions) { // Update transition model. - BaseFloat objf_impr, count; - trans_model.MapUpdate(transition_accs, tcfg, &objf_impr, &count); - KALDI_LOG << "Transition model update: Overall " << (objf_impr/count) - << " log-like improvement per frame over " << (count) - << " frames."; - } - { // Update GMMs. BaseFloat objf_impr, count; BaseFloat tot_like = gmm_accs.TotLogLike(), diff --git a/src/gmmbin/gmm-est-regtree-fmllr-ali.cc b/src/gmmbin/gmm-est-regtree-fmllr-ali.cc deleted file mode 100644 index 0158bae8298..00000000000 --- a/src/gmmbin/gmm-est-regtree-fmllr-ali.cc +++ /dev/null @@ -1,202 +0,0 @@ -// gmmbin/gmm-est-regtree-fmllr-ali.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include -using std::string; -#include -using std::vector; - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" -#include "transform/regtree-fmllr-diag-gmm.h" - -int main(int argc, char *argv[]) { - try { - typedef kaldi::int32 int32; - using namespace kaldi; - const char *usage = - "Compute FMLLR transforms per-utterance (default) or per-speaker for " - "the supplied set of speakers (spk2utt option). Note: writes RegtreeFmllrDiagGmm objects\n" - "Usage: gmm-est-regtree-fmllr-ali [options] " - " \n"; - - ParseOptions po(usage); - string spk2utt_rspecifier; - bool binary = true; - po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to " - "utterance-list map"); - po.Register("binary", &binary, "Write output in binary mode"); - // register other modules - RegtreeFmllrOptions opts; - opts.Register(&po); - - po.Read(argc, argv); - - if (po.NumArgs() != 5) { - po.PrintUsage(); - exit(1); - } - - string model_filename = po.GetArg(1), - feature_rspecifier = po.GetArg(2), - alignments_rspecifier = po.GetArg(3), - regtree_filename = po.GetArg(4), - xforms_wspecifier = po.GetArg(5); - - RandomAccessInt32VectorReader alignments_reader(alignments_rspecifier); - RegtreeFmllrDiagGmmWriter fmllr_writer(xforms_wspecifier); - - AmDiagGmm am_gmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_gmm.Read(ki.Stream(), binary); - } - RegressionTree regtree; - { - bool binary; - Input in(regtree_filename, &binary); - regtree.Read(in.Stream(), binary, am_gmm); - } - - RegtreeFmllrDiagGmm fmllr_xforms; - RegtreeFmllrDiagGmmAccs fmllr_accs; - fmllr_accs.Init(regtree.NumBaseclasses(), am_gmm.Dim()); - - double tot_like = 0.0; - kaldi::int64 tot_t = 0; - - int32 num_done = 0, num_no_alignment = 0, num_other_error = 0; - double tot_objf_impr = 0.0, tot_t_objf = 0.0; - if (spk2utt_rspecifier != "") { // per-speaker adaptation - SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); - RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); - for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { - string spk = spk2utt_reader.Key(); - fmllr_accs.SetZero(); - const vector &uttlist = spk2utt_reader.Value(); - for (vector::const_iterator utt_itr = uttlist.begin(), - itr_end = uttlist.end(); utt_itr != itr_end; ++utt_itr) { - if (!feature_reader.HasKey(*utt_itr)) { - KALDI_WARN << "Did not find features for utterance " << *utt_itr; - continue; - } - if (!alignments_reader.HasKey(*utt_itr)) { - KALDI_WARN << "Did not find aligned transcription for utterance " - << *utt_itr; - num_no_alignment++; - continue; - } - const Matrix &feats = feature_reader.Value(*utt_itr); - const vector &alignment = alignments_reader.Value(*utt_itr); - if (static_cast(alignment.size()) != feats.NumRows()) { - KALDI_WARN << "Alignments has wrong size " << (alignment.size()) - << " vs. 
" << (feats.NumRows()); - num_other_error++; - continue; - } - - BaseFloat file_like = 0.0; - for (size_t i = 0; i < alignment.size(); i++) { - int32 pdf_id = trans_model.TransitionIdToPdf(alignment[i]); - file_like += fmllr_accs.AccumulateForGmm(regtree, am_gmm, - feats.Row(i), pdf_id, 1.0); - } - KALDI_VLOG(2) << "Average like for this file is " << (file_like - / alignment.size()) << " over " << alignment.size() - << " frames.\n"; - tot_like += file_like; - tot_t += alignment.size(); - num_done++; - if (num_done % 10 == 0) KALDI_VLOG(1) - << "Avg like per frame so far is " << (tot_like / tot_t) << '\n'; - } // end looping over all utterances of the current speaker - BaseFloat objf_impr, t; - fmllr_accs.Update(regtree, opts, &fmllr_xforms, &objf_impr, &t); - KALDI_LOG << "fMLLR objf improvement for speaker " << spk << " is " - << (objf_impr/(t+1.0e-10)) << " per frame over " << t - << " frames."; - tot_objf_impr += objf_impr; - tot_t_objf += t; - fmllr_writer.Write(spk, fmllr_xforms); - } // end looping over speakers - } else { // per-utterance adaptation - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - for (; !feature_reader.Done(); feature_reader.Next()) { - string key = feature_reader.Key(); - if (!alignments_reader.HasKey(key)) { - KALDI_WARN << "Did not find aligned transcription for utterance " - << key; - num_no_alignment++; - continue; - } - const Matrix &feats = feature_reader.Value(); - const vector &alignment = alignments_reader.Value(key); - - if (static_cast(alignment.size()) != feats.NumRows()) { - KALDI_WARN << "Alignments has wrong size " << (alignment.size()) - << " vs. " << (feats.NumRows()); - num_other_error++; - continue; - } - - num_done++; - BaseFloat file_like = 0.0; - fmllr_accs.SetZero(); - for (size_t i = 0; i < alignment.size(); i++) { - int32 pdf_id = trans_model.TransitionIdToPdf(alignment[i]); - file_like += fmllr_accs.AccumulateForGmm(regtree, am_gmm, - feats.Row(i), pdf_id, 1.0); - } - KALDI_VLOG(2) << "Average like for this file is " << (file_like - / alignment.size()) << " over " << alignment.size() << " frames."; - tot_like += file_like; - tot_t += alignment.size(); - if (num_done % 10 == 0) KALDI_VLOG(1) - << "Avg like per frame so far is " << (tot_like / tot_t); - BaseFloat objf_impr, t; - fmllr_accs.Update(regtree, opts, &fmllr_xforms, &objf_impr, &t); - KALDI_LOG << "fMLLR objf improvement for utterance " << key << " is " - << (objf_impr/(t+1.0e-10)) << " per frame over " << t - << " frames."; - tot_objf_impr += objf_impr; - tot_t_objf += t; - fmllr_writer.Write(feature_reader.Key(), fmllr_xforms); - } - } - - KALDI_LOG << "Overall objf improvement from fMLLR is " - << (tot_objf_impr/tot_t_objf) - << " per frame over " << tot_t_objf << " frames."; - KALDI_LOG << "Done " << num_done << " files, " << num_no_alignment - << " with no alignments, " << num_other_error - << " with other errors."; - KALDI_LOG << "Overall acoustic like per frame = " << (tot_like / tot_t) - << " over " << tot_t << " frames."; - return 0; - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - diff --git a/src/gmmbin/gmm-est-regtree-fmllr.cc b/src/gmmbin/gmm-est-regtree-fmllr.cc deleted file mode 100644 index ca807f07fd4..00000000000 --- a/src/gmmbin/gmm-est-regtree-fmllr.cc +++ /dev/null @@ -1,216 +0,0 @@ -// gmmbin/gmm-est-regtree-fmllr.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation -// 2014 Guoguo Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the 
Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -using std::string; -#include -using std::vector; - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" -#include "hmm/posterior.h" -#include "transform/regtree-fmllr-diag-gmm.h" - -int main(int argc, char *argv[]) { - try { - typedef kaldi::int32 int32; - using namespace kaldi; - const char *usage = - "Compute FMLLR transforms per-utterance (default) or per-speaker for " - "the supplied set of speakers (spk2utt option). Note: writes RegtreeFmllrDiagGmm objects\n" - "Usage: gmm-est-regtree-fmllr [options] " - " \n"; - - ParseOptions po(usage); - string spk2utt_rspecifier; - bool binary = true; - po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to " - "utterance-list map"); - po.Register("binary", &binary, "Write output in binary mode"); - // register other modules - RegtreeFmllrOptions opts; - opts.Register(&po); - - po.Read(argc, argv); - - if (po.NumArgs() != 5) { - po.PrintUsage(); - exit(1); - } - - string model_filename = po.GetArg(1), - feature_rspecifier = po.GetArg(2), - posteriors_rspecifier = po.GetArg(3), - regtree_filename = po.GetArg(4), - xforms_wspecifier = po.GetArg(5); - - RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier); - RegtreeFmllrDiagGmmWriter fmllr_writer(xforms_wspecifier); - - AmDiagGmm am_gmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_gmm.Read(ki.Stream(), binary); - } - RegressionTree regtree; - { - bool binary; - Input in(regtree_filename, &binary); - regtree.Read(in.Stream(), binary, am_gmm); - } - - RegtreeFmllrDiagGmm fmllr_xforms; - RegtreeFmllrDiagGmmAccs fmllr_accs; - fmllr_accs.Init(regtree.NumBaseclasses(), am_gmm.Dim()); - - double tot_like = 0.0, tot_t = 0; - - int32 num_done = 0, num_no_posterior = 0, num_other_error = 0; - double tot_objf_impr = 0.0, tot_t_objf = 0.0; - if (spk2utt_rspecifier != "") { // per-speaker adaptation - SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); - RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); - for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { - string spk = spk2utt_reader.Key(); - fmllr_accs.SetZero(); - const vector &uttlist = spk2utt_reader.Value(); - for (vector::const_iterator utt_itr = uttlist.begin(), - itr_end = uttlist.end(); utt_itr != itr_end; ++utt_itr) { - if (!feature_reader.HasKey(*utt_itr)) { - KALDI_WARN << "Did not find features for utterance " << *utt_itr; - continue; - } - if (!posteriors_reader.HasKey(*utt_itr)) { - KALDI_WARN << "Did not find posteriors for utterance " - << *utt_itr; - num_no_posterior++; - continue; - } - const Matrix &feats = feature_reader.Value(*utt_itr); - const Posterior &posterior = posteriors_reader.Value(*utt_itr); - if (static_cast(posterior.size()) != feats.NumRows()) { - KALDI_WARN << "Posteriors has wrong size " << 
(posterior.size()) - << " vs. " << (feats.NumRows()); - num_other_error++; - continue; - } - - BaseFloat file_like = 0.0, file_t = 0.0; - Posterior pdf_posterior; - ConvertPosteriorToPdfs(trans_model, posterior, &pdf_posterior); - for (size_t i = 0; i < posterior.size(); i++) { - for (size_t j = 0; j < pdf_posterior[i].size(); j++) { - int32 pdf_id = pdf_posterior[i][j].first; - BaseFloat prob = pdf_posterior[i][j].second; - file_like += fmllr_accs.AccumulateForGmm(regtree, am_gmm, - feats.Row(i), pdf_id, - prob); - file_t += prob; - } - } - KALDI_VLOG(2) << "Average like for this file is " << (file_like/file_t) - << " over " << file_t << " frames."; - tot_like += file_like; - tot_t += file_t; - num_done++; - if (num_done % 10 == 0) - KALDI_VLOG(1) << "Avg like per frame so far is " - << (tot_like / tot_t); - } // end looping over all utterances of the current speaker - BaseFloat objf_impr, t; - fmllr_accs.Update(regtree, opts, &fmllr_xforms, &objf_impr, &t); - KALDI_LOG << "fMLLR objf improvement for speaker " << spk << " is " - << (objf_impr/(t+1.0e-10)) << " per frame over " << t - << " frames."; - tot_objf_impr += objf_impr; - tot_t_objf += t; - fmllr_writer.Write(spk, fmllr_xforms); - } // end looping over speakers - } else { // per-utterance adaptation - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - for (; !feature_reader.Done(); feature_reader.Next()) { - string key = feature_reader.Key(); - if (!posteriors_reader.HasKey(key)) { - KALDI_WARN << "Did not find posteriors for utterance " - << key; - num_no_posterior++; - continue; - } - const Matrix &feats = feature_reader.Value(); - const Posterior &posterior = posteriors_reader.Value(key); - - if (static_cast(posterior.size()) != feats.NumRows()) { - KALDI_WARN << "Posteriors has wrong size " << (posterior.size()) - << " vs. 
" << (feats.NumRows()); - num_other_error++; - continue; - } - - num_done++; - BaseFloat file_like = 0.0, file_t = 0.0; - fmllr_accs.SetZero(); - Posterior pdf_posterior; - ConvertPosteriorToPdfs(trans_model, posterior, &pdf_posterior); - for (size_t i = 0; i < posterior.size(); i++) { - for (size_t j = 0; j < pdf_posterior[i].size(); j++) { - int32 pdf_id = pdf_posterior[i][j].first; - BaseFloat prob = pdf_posterior[i][j].second; - file_like += fmllr_accs.AccumulateForGmm(regtree, am_gmm, - feats.Row(i), pdf_id, - prob); - file_t += prob; - } - } - KALDI_VLOG(2) << "Average like for this file is " << (file_like/file_t) - << " over " << file_t << " frames."; - tot_like += file_like; - tot_t += file_t; - if (num_done % 10 == 0) - KALDI_VLOG(1) << "Avg like per frame so far is " - << (tot_like / tot_t); - BaseFloat objf_impr, t; - fmllr_accs.Update(regtree, opts, &fmllr_xforms, &objf_impr, &t); - KALDI_LOG << "fMLLR objf improvement for utterance " << key << " is " - << (objf_impr/(t+1.0e-10)) << " per frame over " << t - << " frames."; - tot_objf_impr += objf_impr; - tot_t_objf += t; - fmllr_writer.Write(feature_reader.Key(), fmllr_xforms); - } - } - KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior - << " with no posteriors, " << num_other_error - << " with other errors."; - KALDI_LOG << "Overall objf improvement from MLLR is " << (tot_objf_impr/tot_t_objf) - << " per frame " << " over " << tot_t_objf << " frames."; - KALDI_LOG << "Overall acoustic likelihood was " << (tot_like/tot_t) - << " over " << tot_t << " frames."; - return 0; - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - diff --git a/src/gmmbin/gmm-est-regtree-mllr.cc b/src/gmmbin/gmm-est-regtree-mllr.cc deleted file mode 100644 index a4df5cc84c1..00000000000 --- a/src/gmmbin/gmm-est-regtree-mllr.cc +++ /dev/null @@ -1,215 +0,0 @@ -// gmmbin/gmm-est-regtree-mllr.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation -// 2014 Guoguo Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -using std::string; -#include -using std::vector; - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" -#include "transform/regtree-mllr-diag-gmm.h" -#include "hmm/posterior.h" - -int main(int argc, char *argv[]) { - try { - typedef kaldi::int32 int32; - using namespace kaldi; - const char *usage = - "Compute MLLR transforms per-utterance (default) or per-speaker for " - "the supplied set of speakers (spk2utt option). 
Note: writes RegtreeMllrDiagGmm objects\n" - "Usage: gmm-est-regtree-mllr [options] " - " \n"; - - ParseOptions po(usage); - string spk2utt_rspecifier; - bool binary = true; - po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to " - "utterance-list map"); - po.Register("binary", &binary, "Write output in binary mode"); - // register other modules - RegtreeMllrOptions opts; - opts.Register(&po); - - po.Read(argc, argv); - - if (po.NumArgs() != 5) { - po.PrintUsage(); - exit(1); - } - - string model_filename = po.GetArg(1), - feature_rspecifier = po.GetArg(2), - posteriors_rspecifier = po.GetArg(3), - regtree_filename = po.GetArg(4), - xforms_wspecifier = po.GetArg(5); - - RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier); - RegtreeMllrDiagGmmWriter mllr_writer(xforms_wspecifier); - - AmDiagGmm am_gmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_gmm.Read(ki.Stream(), binary); - } - RegressionTree regtree; - { - bool binary; - Input in(regtree_filename, &binary); - regtree.Read(in.Stream(), binary, am_gmm); - } - - RegtreeMllrDiagGmm mllr_xforms; - RegtreeMllrDiagGmmAccs mllr_accs; - mllr_accs.Init(regtree.NumBaseclasses(), am_gmm.Dim()); - - double tot_like = 0.0, tot_t = 0; - - int32 num_done = 0, num_no_posterior = 0, num_other_error = 0; - double tot_objf_impr = 0.0, tot_t_objf = 0.0; - if (spk2utt_rspecifier != "") { // per-speaker adaptation - SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); - RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); - for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { - string spk = spk2utt_reader.Key(); - mllr_accs.SetZero(); - const vector &uttlist = spk2utt_reader.Value(); - for (vector::const_iterator utt_itr = uttlist.begin(), - itr_end = uttlist.end(); utt_itr != itr_end; ++utt_itr) { - if (!feature_reader.HasKey(*utt_itr)) { - KALDI_WARN << "Did not find features for utterance " << *utt_itr; - continue; - } - if (!posteriors_reader.HasKey(*utt_itr)) { - KALDI_WARN << "Did not find posteriors for utterance " - << *utt_itr; - num_no_posterior++; - continue; - } - const Matrix &feats = feature_reader.Value(*utt_itr); - const Posterior &posterior = posteriors_reader.Value(*utt_itr); - if (posterior.size() != feats.NumRows()) { - KALDI_WARN << "Posteriors has wrong size " << (posterior.size()) - << " vs. 
" << (feats.NumRows()); - num_other_error++; - continue; - } - - BaseFloat file_like = 0.0, file_t = 0.0; - Posterior pdf_posterior; - ConvertPosteriorToPdfs(trans_model, posterior, &pdf_posterior); - for (size_t i = 0; i < posterior.size(); i++) { - for (size_t j = 0; j < pdf_posterior[i].size(); j++) { - int32 pdf_id = pdf_posterior[i][j].first; - BaseFloat prob = pdf_posterior[i][j].second; - file_like += mllr_accs.AccumulateForGmm(regtree, am_gmm, - feats.Row(i), pdf_id, - prob); - file_t += prob; - } - } - KALDI_VLOG(2) << "Average like for this file is " << (file_like/file_t) - << " over " << file_t << " frames."; - tot_like += file_like; - tot_t += file_t; - num_done++; - if (num_done % 10 == 0) - KALDI_VLOG(1) << "Avg like per frame so far is " - << (tot_like / tot_t); - } // end looping over all utterances of the current speaker - BaseFloat objf_impr, t; - mllr_accs.Update(regtree, opts, &mllr_xforms, &objf_impr, &t); - KALDI_LOG << "MLLR objf improvement for speaker " << spk << " is " - << (objf_impr/(t+1.0e-10)) << " per frame over " << t - << " frames."; - tot_objf_impr += objf_impr; - tot_t_objf += t; - mllr_writer.Write(spk, mllr_xforms); - } // end looping over speakers - } else { // per-utterance adaptation - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - for (; !feature_reader.Done(); feature_reader.Next()) { - string key = feature_reader.Key(); - if (!posteriors_reader.HasKey(key)) { - KALDI_WARN << "Did not find aligned transcription for utterance " - << key; - num_no_posterior++; - continue; - } - const Matrix &feats = feature_reader.Value(); - const Posterior &posterior = posteriors_reader.Value(key); - - if (posterior.size() != feats.NumRows()) { - KALDI_WARN << "Posteriors has wrong size " << (posterior.size()) - << " vs. 
" << (feats.NumRows()); - num_other_error++; - continue; - } - - num_done++; - BaseFloat file_like = 0.0, file_t = 0.0; - mllr_accs.SetZero(); - Posterior pdf_posterior; - ConvertPosteriorToPdfs(trans_model, posterior, &pdf_posterior); - for (size_t i = 0; i < posterior.size(); i++) { - for (size_t j = 0; j < pdf_posterior[i].size(); j++) { - int32 pdf_id = pdf_posterior[i][j].first; - BaseFloat prob = pdf_posterior[i][j].second; - file_like += mllr_accs.AccumulateForGmm(regtree, am_gmm, - feats.Row(i), pdf_id, - prob); - file_t += prob; - } - } - KALDI_VLOG(2) << "Average like for this file is " << (file_like/file_t) - << " over " << file_t << " frames."; - tot_like += file_like; - tot_t += file_t; - if (num_done % 10 == 0) - KALDI_VLOG(1) << "Avg like per frame so far is " << (tot_like / tot_t); - BaseFloat objf_impr, t; - mllr_accs.Update(regtree, opts, &mllr_xforms, &objf_impr, &t); - KALDI_LOG << "MLLR objf improvement for utterance " << key << " is " - << (objf_impr/(t+1.0e-10)) << " per frame over " << t - << " frames."; - tot_objf_impr += objf_impr; - tot_t_objf += t; - mllr_writer.Write(feature_reader.Key(), mllr_xforms); - } - } - KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior - << " with no posteriors, " << num_other_error - << " with other errors."; - KALDI_LOG << "Overall objf improvement from MLLR is " << (tot_objf_impr/tot_t_objf) - << " per frame " << " over " << tot_t_objf << " frames."; - KALDI_LOG << "Overall acoustic likelihood was " << (tot_like/tot_t) - << " over " << tot_t << " frames."; - return 0; - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - diff --git a/src/gmmbin/gmm-est-rescale.cc b/src/gmmbin/gmm-est-rescale.cc index a432b3d77f6..1e9c1e2aa84 100644 --- a/src/gmmbin/gmm-est-rescale.cc +++ b/src/gmmbin/gmm-est-rescale.cc @@ -21,7 +21,7 @@ #include "util/common-utils.h" #include "gmm/indirect-diff-diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" int main(int argc, char *argv[]) { using namespace kaldi; @@ -62,7 +62,7 @@ int main(int argc, char *argv[]) { model_wxfilename = po.GetArg(4); AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary_read; Input ki(model_rxfilename, &binary_read); diff --git a/src/gmmbin/gmm-est-weights-ebw.cc b/src/gmmbin/gmm-est-weights-ebw.cc index f19343a7ac4..9cf2c2d7d04 100644 --- a/src/gmmbin/gmm-est-weights-ebw.cc +++ b/src/gmmbin/gmm-est-weights-ebw.cc @@ -21,7 +21,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "gmm/ebw-diag-gmm.h" int main(int argc, char *argv[]) { @@ -62,7 +62,7 @@ int main(int argc, char *argv[]) { model_out_filename = po.GetArg(4); AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary_read; Input ki(model_in_filename, &binary_read); diff --git a/src/gmmbin/gmm-est.cc b/src/gmmbin/gmm-est.cc index 18c836a1f50..5bde0923536 100644 --- a/src/gmmbin/gmm-est.cc +++ b/src/gmmbin/gmm-est.cc @@ -1,6 +1,7 @@ // gmmbin/gmm-est.cc // Copyright 2009-2011 Microsoft Corporation +// 2019 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -21,7 +22,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "gmm/mle-am-diag-gmm.h" int main(int argc, 
char *argv[]) { @@ -35,7 +36,6 @@ int main(int argc, char *argv[]) { "e.g.: gmm-est 1.mdl 1.acc 2.mdl\n"; bool binary_write = true; - MleTransitionUpdateConfig tcfg; MleDiagGmmOptions gmm_opts; int32 mixup = 0; int32 mixdown = 0; @@ -61,7 +61,6 @@ int main(int argc, char *argv[]) { "means by standard deviation times this factor."); po.Register("write-occs", &occs_out_filename, "File to write pdf " "occupation counts to."); - tcfg.Register(&po); gmm_opts.Register(&po); po.Read(argc, argv); @@ -79,7 +78,7 @@ int main(int argc, char *argv[]) { model_out_filename = po.GetArg(3); AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary_read; Input ki(model_in_filename, &binary_read); @@ -87,23 +86,13 @@ int main(int argc, char *argv[]) { am_gmm.Read(ki.Stream(), binary_read); } - Vector transition_accs; AccumAmDiagGmm gmm_accs; { bool binary; Input ki(stats_filename, &binary); - transition_accs.Read(ki.Stream(), binary); gmm_accs.Read(ki.Stream(), binary, true); // true == add; doesn't matter here. } - if (update_flags & kGmmTransitions) { // Update transition model. - BaseFloat objf_impr, count; - trans_model.MleUpdate(transition_accs, tcfg, &objf_impr, &count); - KALDI_LOG << "Transition model update: Overall " << (objf_impr/count) - << " log-like improvement per frame over " << (count) - << " frames."; - } - { // Update GMMs. BaseFloat objf_impr, count; BaseFloat tot_like = gmm_accs.TotLogLike(), @@ -143,12 +132,10 @@ int main(int argc, char *argv[]) { am_gmm.Write(ko.Stream(), binary_write); } - KALDI_LOG << "Written model to " << model_out_filename; + KALDI_LOG << "Wrote model to " << model_out_filename; return 0; } catch(const std::exception &e) { std::cerr << e.what() << '\n'; return -1; } } - - diff --git a/src/gmmbin/gmm-fmpe-acc-stats.cc b/src/gmmbin/gmm-fmpe-acc-stats.cc deleted file mode 100644 index 4868b63b6ae..00000000000 --- a/src/gmmbin/gmm-fmpe-acc-stats.cc +++ /dev/null @@ -1,155 +0,0 @@ -// gmmbin/gmm-fmpe-acc-stats.cc - -// Copyright 2012 Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" -#include "transform/fmpe.h" - - -int main(int argc, char *argv[]) { - using namespace kaldi; - using kaldi::int32; - try { - const char *usage = - "Accumulate stats for fMPE training, using GMM model. Note: this could\n" - "be done using gmm-get-feat-deriv and fmpe-acc-stats (but you'd be computing\n" - "the features twice). 
Features input should be pre-fMPE features.\n" - "\n" - "Usage: gmm-fmpe-acc-stats [options] " - " \n" - "e.g.: \n" - " gmm-fmpe-acc-stats --model-derivative 1.accs 1.mdl 1.fmpe \"$feats\" ark:1.gselect ark:1.post 1.fmpe_stats\n"; - - ParseOptions po(usage); - bool binary = true; - std::string model_derivative_rxfilename; - po.Register("binary", &binary, "If true, write stats in binary mode."); - po.Register("model-derivative", &model_derivative_rxfilename, - "GMM-accs file containing model derivative [note: contains no transition stats]. Used for indirect differential. Warning: this will only work correctly in the case of MMI/BMMI objective function, with non-canceled stats."); - po.Read(argc, argv); - - if (po.NumArgs() != 6) { - po.PrintUsage(); - exit(1); - } - - std::string model_rxfilename = po.GetArg(1), - fmpe_rxfilename = po.GetArg(2), - feature_rspecifier = po.GetArg(3), - gselect_rspecifier = po.GetArg(4), - posteriors_rspecifier = po.GetArg(5), - stats_wxfilename = po.GetArg(6); - - AmDiagGmm am_gmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_rxfilename, &binary); - trans_model.Read(ki.Stream(), binary); - am_gmm.Read(ki.Stream(), binary); - } - - Fmpe fmpe; - ReadKaldiObject(fmpe_rxfilename, &fmpe); - - - bool have_indirect = (model_derivative_rxfilename != ""); - AccumAmDiagGmm model_derivative; - if (have_indirect) - ReadKaldiObject(model_derivative_rxfilename, &model_derivative); - - FmpeStats fmpe_stats(fmpe); - - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier); - - BaseFloat tot_like = 0.0; // tot like weighted by posterior. - int32 num_frames = 0; - int32 num_done = 0, num_err = 0; - - for (; !feature_reader.Done(); feature_reader.Next()) { - std::string key = feature_reader.Key(); - if (!posteriors_reader.HasKey(key)) { - num_err++; - KALDI_WARN << "No posteriors for utterance " << key; - continue; - } - const Matrix &feat_in = feature_reader.Value(); - const Posterior &posterior = posteriors_reader.Value(key); - - if (static_cast(posterior.size()) != feat_in.NumRows()) { - KALDI_WARN << "Posterior vector has wrong size " << - (posterior.size()) << " vs. "<< (feat_in.NumRows()); - num_err++; - continue; - } - - if (!gselect_reader.HasKey(key)) { - KALDI_WARN << "No gselect information for key " << key; - num_err++; - continue; - } - const std::vector > &gselect = - gselect_reader.Value(key); - if (static_cast(gselect.size()) != feat_in.NumRows()) { - KALDI_WARN << "gselect information has wrong size"; - num_err++; - continue; - } - - num_done++; - Matrix fmpe_feat(feat_in.NumRows(), feat_in.NumCols()); - fmpe.ComputeFeatures(feat_in, gselect, &fmpe_feat); - fmpe_feat.AddMat(1.0, feat_in); - - Matrix direct_deriv, indirect_deriv; - - tot_like += ComputeAmGmmFeatureDeriv(am_gmm, trans_model, posterior, - fmpe_feat, &direct_deriv, - (have_indirect ? &model_derivative : NULL), - (have_indirect ? &indirect_deriv : NULL)); - num_frames += feat_in.NumRows(); - - fmpe.AccStats(feat_in, gselect, direct_deriv, - (have_indirect ? 
&indirect_deriv : NULL), &fmpe_stats); - - if (num_done % 100 == 0) - KALDI_LOG << "Processed " << num_done << " utterances."; - } - - KALDI_LOG << "Done " << num_done << " files, " << num_err - << " with errors."; - KALDI_LOG << "Overall weighted acoustic likelihood per frame is " - << (tot_like/num_frames) << " over " << num_frames << " frames."; - - Output ko(stats_wxfilename, binary); - fmpe_stats.Write(ko.Stream(), binary); - - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/gmmbin/gmm-get-stats-deriv.cc b/src/gmmbin/gmm-get-stats-deriv.cc index 939fe260b34..a6fd9764719 100644 --- a/src/gmmbin/gmm-get-stats-deriv.cc +++ b/src/gmmbin/gmm-get-stats-deriv.cc @@ -21,7 +21,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "gmm/indirect-diff-diag-gmm.h" int main(int argc, char *argv[]) { @@ -64,7 +64,7 @@ int main(int argc, char *argv[]) { deriv_wxfilename = po.GetArg(5); AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary_read; Input ki(model_rxfilename, &binary_read); diff --git a/src/gmmbin/gmm-global-est-fmllr.cc b/src/gmmbin/gmm-global-est-fmllr.cc index b1d5b68e594..951b8addf2d 100644 --- a/src/gmmbin/gmm-global-est-fmllr.cc +++ b/src/gmmbin/gmm-global-est-fmllr.cc @@ -25,7 +25,7 @@ using std::vector; #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/fmllr-diag-gmm.h" namespace kaldi { diff --git a/src/gmmbin/gmm-global-est-lvtln-trans.cc b/src/gmmbin/gmm-global-est-lvtln-trans.cc index 10bb5bec5d5..95b56503f2c 100644 --- a/src/gmmbin/gmm-global-est-lvtln-trans.cc +++ b/src/gmmbin/gmm-global-est-lvtln-trans.cc @@ -26,7 +26,7 @@ using std::vector; #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/lvtln.h" #include "hmm/posterior.h" diff --git a/src/gmmbin/gmm-global-info.cc b/src/gmmbin/gmm-global-info.cc index 7c21005b449..00222ef81c3 100644 --- a/src/gmmbin/gmm-global-info.cc +++ b/src/gmmbin/gmm-global-info.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" int main(int argc, char *argv[]) { try { diff --git a/src/gmmbin/gmm-gselect.cc b/src/gmmbin/gmm-gselect.cc index a873b962591..357998e996d 100644 --- a/src/gmmbin/gmm-gselect.cc +++ b/src/gmmbin/gmm-gselect.cc @@ -22,7 +22,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" int main(int argc, char *argv[]) { try { diff --git a/src/gmmbin/gmm-info.cc b/src/gmmbin/gmm-info.cc index 31f7aea0921..689c68150ec 100644 --- a/src/gmmbin/gmm-info.cc +++ b/src/gmmbin/gmm-info.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" int main(int argc, char *argv[]) { try { @@ -46,7 +46,7 @@ int main(int argc, char *argv[]) { std::string model_in_filename = po.GetArg(1); AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary_read; Input ki(model_in_filename, &binary_read); @@ 
-58,8 +58,6 @@ int main(int argc, char *argv[]) { std::cout << "number of pdfs " << trans_model.NumPdfs() << '\n'; std::cout << "number of transition-ids " << trans_model.NumTransitionIds() << '\n'; - std::cout << "number of transition-states " - << trans_model.NumTransitionStates() << '\n'; std::cout << "feature dimension " << am_gmm.Dim() << '\n'; std::cout << "number of gaussians " << am_gmm.NumGauss() << '\n'; return 0; diff --git a/src/gmmbin/gmm-init-biphone.cc b/src/gmmbin/gmm-init-biphone.cc index 0775a5c7b23..10fc9ad4048 100644 --- a/src/gmmbin/gmm-init-biphone.cc +++ b/src/gmmbin/gmm-init-biphone.cc @@ -23,8 +23,8 @@ #include "gmm/am-diag-gmm.h" #include "tree/event-map.h" #include "tree/context-dep.h" -#include "hmm/hmm-topology.h" -#include "hmm/transition-model.h" +#include "hmm/topology.h" +#include "hmm/transitions.h" namespace kaldi { // This function reads a file like: @@ -314,7 +314,7 @@ int main(int argc, char *argv[]) { Vector glob_mean(dim); glob_mean.Set(1.0); - HmmTopology topo; + Topology topo; bool binary_in; Input ki(topo_filename, &binary_in); topo.Read(ki.Stream(), binary_in); @@ -375,7 +375,7 @@ int main(int argc, char *argv[]) { am_gmm.AddPdf(gmm); // Now the transition model: - TransitionModel trans_model(*ctx_dep, topo); + Transitions trans_model(*ctx_dep, topo); { Output ko(model_filename, binary); diff --git a/src/gmmbin/gmm-init-model-flat.cc b/src/gmmbin/gmm-init-model-flat.cc index fecd91f49fd..d41b99c35e6 100644 --- a/src/gmmbin/gmm-init-model-flat.cc +++ b/src/gmmbin/gmm-init-model-flat.cc @@ -21,7 +21,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "gmm/mle-am-diag-gmm.h" #include "tree/build-tree-utils.h" #include "tree/context-dep.h" @@ -104,7 +104,7 @@ int main(int argc, char *argv[]) { ContextDependency ctx_dep; ReadKaldiObject(tree_filename, &ctx_dep); - HmmTopology topo; + Topology topo; ReadKaldiObject(topo_filename, &topo); Vector global_inverse_var, global_mean; @@ -138,7 +138,7 @@ int main(int argc, char *argv[]) { for (int i = 0; i < num_pdfs; i++) am_gmm.AddPdf(gmm); - TransitionModel trans_model(ctx_dep, topo); + Transitions trans_model(ctx_dep, topo); { Output ko(model_out_filename, binary); diff --git a/src/gmmbin/gmm-init-model.cc b/src/gmmbin/gmm-init-model.cc index e2d943b19eb..a081f326b1c 100644 --- a/src/gmmbin/gmm-init-model.cc +++ b/src/gmmbin/gmm-init-model.cc @@ -22,7 +22,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "gmm/mle-am-diag-gmm.h" #include "tree/build-tree-utils.h" #include "tree/context-dep.h" @@ -35,7 +35,7 @@ namespace kaldi { void InitAmGmm(const BuildTreeStatsType &stats, const EventMap &to_pdf_map, AmDiagGmm *am_gmm, - const TransitionModel &trans_model, + const Transitions &trans_model, BaseFloat var_floor) { // Get stats split by tree-leaf ( == pdf): std::vector split_stats; @@ -126,7 +126,7 @@ void InitAmGmmFromOld(const BuildTreeStatsType &stats, ContextDependency old_tree; { // Read old_gm_gmm bool binary_in; - TransitionModel old_trans_model; + Transitions old_trans_model; Input ki(old_model_rxfilename, &binary_in); old_trans_model.Read(ki.Stream(), binary_in); old_am_gmm.Read(ki.Stream(), binary_in); @@ -270,12 +270,12 @@ int main(int argc, char *argv[]) { } KALDI_LOG << "Number of separate statistics is " << stats.size(); - HmmTopology topo; + Topology topo; 
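Note: the TransitionModel -> Transitions and HmmTopology -> Topology renames in the hunks above and below (with hmm/transition-model.h -> hmm/transitions.h and hmm/hmm-topology.h -> hmm/topology.h) are mechanical across the gmmbin tools; every binary keeps the same two-object model file layout. A minimal sketch of the shared read pattern, assuming only the renamed headers from this patch (the helper name ReadAmAndTransitions is illustrative and not part of the patch):

  #include "gmm/am-diag-gmm.h"
  #include "hmm/transitions.h"
  #include "util/common-utils.h"

  // Reads the two-part model file these tools use: the Transitions object
  // comes first in the stream, then the AmDiagGmm, exactly as in the hunks above.
  void ReadAmAndTransitions(const std::string &model_rxfilename,
                            kaldi::AmDiagGmm *am_gmm,
                            kaldi::Transitions *trans_model) {
    bool binary_read;
    kaldi::Input ki(model_rxfilename, &binary_read);
    trans_model->Read(ki.Stream(), binary_read);
    am_gmm->Read(ki.Stream(), binary_read);
  }
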
ReadKaldiObject(topo_filename, &topo); const EventMap &to_pdf = ctx_dep.ToPdfMap(); // not owned here. - TransitionModel trans_model(ctx_dep, topo); + Transitions trans_model(ctx_dep, topo); // Now, the summed_stats will be used to initialize the GMM. AmDiagGmm am_gmm; diff --git a/src/gmmbin/gmm-init-mono.cc b/src/gmmbin/gmm-init-mono.cc index 3c370c36515..a91948e446b 100644 --- a/src/gmmbin/gmm-init-mono.cc +++ b/src/gmmbin/gmm-init-mono.cc @@ -21,8 +21,8 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/hmm-topology.h" -#include "hmm/transition-model.h" +#include "hmm/topology.h" +#include "hmm/transitions.h" #include "tree/context-dep.h" namespace kaldi { @@ -116,7 +116,7 @@ int main(int argc, char *argv[]) { glob_mean.CopyFromVec(mean_stats); } - HmmTopology topo; + Topology topo; bool binary_in; Input ki(topo_filename, &binary_in); topo.Read(ki.Stream(), binary_in); @@ -164,7 +164,7 @@ int main(int argc, char *argv[]) { } // Now the transition model: - TransitionModel trans_model(*ctx_dep, topo); + Transitions trans_model(*ctx_dep, topo); { Output ko(model_filename, binary); diff --git a/src/gmmbin/gmm-ismooth-stats.cc b/src/gmmbin/gmm-ismooth-stats.cc index b29e1efc1c3..a524d27b47b 100644 --- a/src/gmmbin/gmm-ismooth-stats.cc +++ b/src/gmmbin/gmm-ismooth-stats.cc @@ -21,7 +21,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "gmm/ebw-diag-gmm.h" int main(int argc, char *argv[]) { @@ -77,7 +77,7 @@ int main(int argc, char *argv[]) { stats.Write(ko.Stream(), binary_write); } else if (smooth_from_model) { // Smoothing from model... AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; Vector dst_transition_accs; AccumAmDiagGmm dst_stats; { // read src model diff --git a/src/gmmbin/gmm-latgen-biglm-faster.cc b/src/gmmbin/gmm-latgen-biglm-faster.cc index d4e0645b16c..0d881b41ebb 100644 --- a/src/gmmbin/gmm-latgen-biglm-faster.cc +++ b/src/gmmbin/gmm-latgen-biglm-faster.cc @@ -24,7 +24,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/lattice-biglm-faster-decoder.h" #include "gmm/decodable-am-diag-gmm.h" @@ -35,7 +35,7 @@ namespace kaldi { // Takes care of output. Returns true on success. bool DecodeUtterance(LatticeBiglmFasterDecoder &decoder, // not const but is really an input. DecodableInterface &decodable, // not const but is really an input. 
- const TransitionModel &trans_model, + const Transitions &trans_model, const fst::SymbolTable *word_syms, std::string utt, double acoustic_scale, @@ -186,7 +186,7 @@ int main(int argc, char *argv[]) { words_wspecifier = po.GetOptArg(7), alignment_wspecifier = po.GetOptArg(8); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; diff --git a/src/gmmbin/gmm-latgen-faster-parallel.cc b/src/gmmbin/gmm-latgen-faster-parallel.cc index 41f414bcb9c..8cc0aa5dad4 100644 --- a/src/gmmbin/gmm-latgen-faster-parallel.cc +++ b/src/gmmbin/gmm-latgen-faster-parallel.cc @@ -24,7 +24,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/decoder-wrappers.h" #include "gmm/decodable-am-diag-gmm.h" @@ -82,7 +82,7 @@ int main(int argc, char *argv[]) { words_wspecifier = po.GetOptArg(5), alignment_wspecifier = po.GetOptArg(6); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; diff --git a/src/gmmbin/gmm-latgen-faster-regtree-fmllr.cc b/src/gmmbin/gmm-latgen-faster-regtree-fmllr.cc deleted file mode 100644 index 36031b13c1e..00000000000 --- a/src/gmmbin/gmm-latgen-faster-regtree-fmllr.cc +++ /dev/null @@ -1,218 +0,0 @@ -// gmmbin/gmm-latgen-faster-regtree-fmllr.cc - -// Copyright 2009-2012 Microsoft Corporation -// 2012-2013 Johns Hopkins University (author: Daniel Povey) -// 2014 Alpha Cephei Inc. - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "gmm/am-diag-gmm.h" -#include "tree/context-dep.h" -#include "hmm/transition-model.h" -#include "fstext/fstext-lib.h" -#include "decoder/decoder-wrappers.h" -#include "gmm/decodable-am-diag-gmm.h" -#include "base/timer.h" -#include "transform/regression-tree.h" -#include "transform/regtree-fmllr-diag-gmm.h" -#include "transform/decodable-am-diag-gmm-regtree.h" -#include "feat/feature-functions.h" // feature reversal - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - typedef kaldi::int32 int32; - using fst::SymbolTable; - using fst::Fst; - using fst::StdArc; - - const char *usage = - "Generate lattices using GMM-based model and RegTree-FMLLR adaptation.\n" - "Usage: gmm-latgen-faster-regtree-fmllr [options] model-in regtree-in (fst-in|fsts-rspecifier) features-rspecifier transform-rspecifier" - " lattice-wspecifier [ words-wspecifier [alignments-wspecifier] ]\n"; - ParseOptions po(usage); - Timer timer; - bool allow_partial = false; - BaseFloat acoustic_scale = 0.1; - LatticeFasterDecoderConfig config; - - std::string word_syms_filename, utt2spk_rspecifier; - config.Register(&po); - po.Register("utt2spk", &utt2spk_rspecifier, "rspecifier for utterance to " - "speaker map used to load the transform"); - po.Register("acoustic-scale", &acoustic_scale, - "Scaling factor for acoustic likelihoods"); - po.Register("word-symbol-table", &word_syms_filename, - "Symbol table for words [for debug output]"); - po.Register("allow-partial", &allow_partial, - "If true, produce output even if end state was not reached."); - - po.Read(argc, argv); - - if (po.NumArgs() < 4 || po.NumArgs() > 6) { - po.PrintUsage(); - exit(1); - } - - std::string model_in_filename = po.GetArg(1), - regtree_in_str = po.GetArg(2), - fst_in_str = po.GetArg(3), - feature_rspecifier = po.GetArg(4), - xforms_rspecifier = po.GetArg(5), - lattice_wspecifier = po.GetArg(6), - words_wspecifier = po.GetOptArg(7), - alignment_wspecifier = po.GetOptArg(8); - - TransitionModel trans_model; - AmDiagGmm am_gmm; - { - bool binary; - Input ki(model_in_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_gmm.Read(ki.Stream(), binary); - } - - RegressionTree regtree; - { - bool binary_read; - Input in(regtree_in_str, &binary_read); - regtree.Read(in.Stream(), binary_read, am_gmm); - } - - RandomAccessRegtreeFmllrDiagGmmReaderMapped fmllr_reader(xforms_rspecifier, - utt2spk_rspecifier); - - bool determinize = config.determinize_lattice; - CompactLatticeWriter compact_lattice_writer; - LatticeWriter lattice_writer; - if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier) - : lattice_writer.Open(lattice_wspecifier))) - KALDI_ERR << "Could not open table for writing lattices: " - << lattice_wspecifier; - - Int32VectorWriter words_writer(words_wspecifier); - - Int32VectorWriter alignment_writer(alignment_wspecifier); - - fst::SymbolTable *word_syms = NULL; - if (word_syms_filename != "") - if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename))) - KALDI_ERR << "Could not read symbol table from file " - << word_syms_filename; - - double tot_like = 0.0; - kaldi::int64 frame_count = 0; - int num_done = 0, num_err = 0; - - if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) { - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - // Input FST is just one FST, not a table of FSTs. 
- Fst *decode_fst = fst::ReadFstKaldiGeneric(fst_in_str); - - { - LatticeFasterDecoder decoder(*decode_fst, config); - - for (; !feature_reader.Done(); feature_reader.Next()) { - std::string utt = feature_reader.Key(); - Matrix features (feature_reader.Value()); - feature_reader.FreeCurrent(); - if (features.NumRows() == 0) { - KALDI_WARN << "Zero-length utterance: " << utt; - num_err++; - continue; - } - if (!fmllr_reader.HasKey(utt)) { - KALDI_WARN << "Not decoding utterance " << utt - << " because no transform available."; - num_err++; - continue; - } - - RegtreeFmllrDiagGmm fmllr(fmllr_reader.Value(utt)); - - kaldi::DecodableAmDiagGmmRegtreeFmllr gmm_decodable(am_gmm, trans_model, - features, fmllr, - regtree, - acoustic_scale); - double like; - if (DecodeUtteranceLatticeFaster( - decoder, gmm_decodable, trans_model, word_syms, utt, acoustic_scale, - determinize, allow_partial, &alignment_writer, &words_writer, - &compact_lattice_writer, &lattice_writer, &like)) { - tot_like += like; - frame_count += features.NumRows(); - num_done++; - } else num_err++; - } - } - delete decode_fst; // delete this only after decoder goes out of scope. - } else { // We have different FSTs for different utterances. - SequentialTableReader fst_reader(fst_in_str); - RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); - for (; !fst_reader.Done(); fst_reader.Next()) { - std::string utt = fst_reader.Key(); - const Matrix &features = feature_reader.Value(utt); - if (features.NumRows() == 0) { - KALDI_WARN << "Zero-length utterance: " << utt; - num_err++; - continue; - } - if (!fmllr_reader.HasKey(utt)) { - KALDI_WARN << "Not decoding utterance " << utt - << " because no transform available."; - num_err++; - continue; - } - - RegtreeFmllrDiagGmm fmllr(fmllr_reader.Value(utt)); - kaldi::DecodableAmDiagGmmRegtreeFmllr gmm_decodable(am_gmm, trans_model, - features, fmllr, - regtree, - acoustic_scale); - - LatticeFasterDecoder decoder(fst_reader.Value(), config); - double like; - if (DecodeUtteranceLatticeFaster( - decoder, gmm_decodable, trans_model, word_syms, utt, acoustic_scale, - determinize, allow_partial, &alignment_writer, &words_writer, - &compact_lattice_writer, &lattice_writer, &like)) { - tot_like += like; - frame_count += features.NumRows(); - num_done++; - } else num_err++; - } - } - - double elapsed = timer.Elapsed(); - KALDI_LOG << "Time taken "<< elapsed - << "s: real-time factor assuming 100 frames/sec is " - << (elapsed*100.0/frame_count); - KALDI_LOG << "Done " << num_done << " utterances, failed for " - << num_err; - KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " - << frame_count << " frames."; - - delete word_syms; - if (num_done != 0) return 0; - else return 1; - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/src/gmmbin/gmm-latgen-faster.cc b/src/gmmbin/gmm-latgen-faster.cc index 6bc475d1b79..75a9d95aacd 100644 --- a/src/gmmbin/gmm-latgen-faster.cc +++ b/src/gmmbin/gmm-latgen-faster.cc @@ -24,7 +24,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/decoder-wrappers.h" #include "gmm/decodable-am-diag-gmm.h" @@ -72,7 +72,7 @@ int main(int argc, char *argv[]) { words_wspecifier = po.GetOptArg(5), alignment_wspecifier = po.GetOptArg(6); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; diff --git 
a/src/gmmbin/gmm-latgen-map.cc b/src/gmmbin/gmm-latgen-map.cc index ccc15f5a20c..6717eaadacb 100644 --- a/src/gmmbin/gmm-latgen-map.cc +++ b/src/gmmbin/gmm-latgen-map.cc @@ -26,7 +26,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "gmm/mle-am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/fmllr-diag-gmm.h" #include "fstext/fstext-lib.h" #include "decoder/decoder-wrappers.h" @@ -85,7 +85,7 @@ int main(int argc, char *argv[]) { words_wspecifier = po.GetOptArg(6), alignment_wspecifier = po.GetOptArg(7); - TransitionModel trans_model; + Transitions trans_model; { bool binary_read; Input is(model_in_filename, &binary_read); diff --git a/src/gmmbin/gmm-latgen-simple.cc b/src/gmmbin/gmm-latgen-simple.cc index 812bee7fef4..d7ffe86c4ae 100644 --- a/src/gmmbin/gmm-latgen-simple.cc +++ b/src/gmmbin/gmm-latgen-simple.cc @@ -24,7 +24,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "fstext/fstext-lib.h" #include "decoder/decoder-wrappers.h" #include "gmm/decodable-am-diag-gmm.h" @@ -71,7 +71,7 @@ int main(int argc, char *argv[]) { words_wspecifier = po.GetOptArg(5), alignment_wspecifier = po.GetOptArg(6); - TransitionModel trans_model; + Transitions trans_model; AmDiagGmm am_gmm; { bool binary; diff --git a/src/gmmbin/gmm-make-regtree.cc b/src/gmmbin/gmm-make-regtree.cc deleted file mode 100644 index 8c79d013e0d..00000000000 --- a/src/gmmbin/gmm-make-regtree.cc +++ /dev/null @@ -1,107 +0,0 @@ -// gmmbin/gmm-make-regtree.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/kaldi-io.h" -#include "util/text-utils.h" -#include "gmm/mle-am-diag-gmm.h" -#include "tree/context-dep.h" -#include "hmm/transition-model.h" -#include "transform/regression-tree.h" - - -int main(int argc, char *argv[]) { - try { - typedef kaldi::int32 int32; - typedef kaldi::BaseFloat BaseFloat; - - const char *usage = - "Build regression class tree.\n" - "Usage: gmm-make-regtree [options] \n" - "E.g.: gmm-make-regtree --silphones=1:2:3 --state-occs=1.occs 1.mdl 1.regtree\n" - " [Note: state-occs come from --write-occs option of gmm-est]\n"; - - std::string occs_in_filename; - std::string sil_phones_str; - bool binary_write = true; - int32 max_leaves = 1; - kaldi::ParseOptions po(usage); - po.Register("state-occs", &occs_in_filename, "File containing state occupancies (use --write-occs in gmm-est)"); - po.Register("sil-phones", &sil_phones_str, "Colon-separated list of integer ids of silence phones, e.g. 
1:2:3; if used, create top-level speech/sil split (only one reg-class for silence)."); - po.Register("binary", &binary_write, "Write output in binary mode"); - po.Register("max-leaves", &max_leaves, "Maximum number of leaves in regression tree."); - po.Read(argc, argv); - - if (po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - - std::string model_in_filename = po.GetArg(1), - tree_out_filename = po.GetArg(2); - - kaldi::AmDiagGmm am_gmm; - kaldi::TransitionModel trans_model; - { - bool binary_read; - kaldi::Input ki(model_in_filename, &binary_read); - trans_model.Read(ki.Stream(), binary_read); - am_gmm.Read(ki.Stream(), binary_read); - } - - kaldi::Vector state_occs; - if (occs_in_filename != "") { - bool binary_read; - kaldi::Input ki(occs_in_filename, &binary_read); - state_occs.Read(ki.Stream(), binary_read); - } else { - KALDI_LOG << "--state-occs option not provided so using constant occupancies."; - state_occs.Resize(am_gmm.NumPdfs()); - state_occs.Set(1.0); - } - - std::vector sil_pdfs; - if (sil_phones_str != "") { - std::vector sil_phones; - if (!kaldi::SplitStringToIntegers(sil_phones_str, ":", false, &sil_phones)) - KALDI_ERR << "invalid sil-phones option " << sil_phones_str; - std::sort(sil_phones.begin(), sil_phones.end()); - bool ans = GetPdfsForPhones(trans_model, sil_phones, &sil_pdfs); - if (!ans) - KALDI_WARN << "Pdfs associated with silence phones are not only " - "associated with silence phones: your speech-silence split " - "may not be meaningful."; - } - - kaldi::RegressionTree regtree; - regtree.BuildTree(state_occs, sil_pdfs, am_gmm, max_leaves); - // Write out the regression tree - { - kaldi::Output ko(tree_out_filename, binary_write); - regtree.Write(ko.Stream(), binary_write); - } - - KALDI_LOG << "Written regression tree to " << tree_out_filename; - } catch(const std::exception &e) { - std::cerr << e.what() << '\n'; - return -1; - } -} - - diff --git a/src/gmmbin/gmm-mixup.cc b/src/gmmbin/gmm-mixup.cc index a76b3805d89..51919560b10 100644 --- a/src/gmmbin/gmm-mixup.cc +++ b/src/gmmbin/gmm-mixup.cc @@ -21,7 +21,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "gmm/mle-am-diag-gmm.h" int main(int argc, char *argv[]) { @@ -70,7 +70,7 @@ int main(int argc, char *argv[]) { model_out_filename = po.GetArg(3); AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary_read; Input ki(model_in_filename, &binary_read); diff --git a/src/gmmbin/gmm-post-to-gpost.cc b/src/gmmbin/gmm-post-to-gpost.cc index 59da0f9a1ac..1260c9b922a 100644 --- a/src/gmmbin/gmm-post-to-gpost.cc +++ b/src/gmmbin/gmm-post-to-gpost.cc @@ -22,7 +22,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/posterior.h" int main(int argc, char *argv[]) { @@ -56,7 +56,7 @@ int main(int argc, char *argv[]) { typedef kaldi::int32 int32; AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary; Input ki(model_filename, &binary); diff --git a/src/gmmbin/gmm-rescore-lattice.cc b/src/gmmbin/gmm-rescore-lattice.cc index 54156442e64..36088cac304 100644 --- a/src/gmmbin/gmm-rescore-lattice.cc +++ b/src/gmmbin/gmm-rescore-lattice.cc @@ -22,7 +22,7 @@ #include "util/common-utils.h" #include "util/stl-utils.h" #include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include 
"fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" #include "lat/lattice-functions.h" @@ -61,7 +61,7 @@ int main(int argc, char *argv[]) { lats_wspecifier = po.GetArg(4); AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary; Input ki(model_filename, &binary); diff --git a/src/gmmbin/gmm-sum-accs.cc b/src/gmmbin/gmm-sum-accs.cc index c9886e867f5..6d99c4a35c9 100644 --- a/src/gmmbin/gmm-sum-accs.cc +++ b/src/gmmbin/gmm-sum-accs.cc @@ -19,7 +19,7 @@ #include "util/common-utils.h" #include "gmm/mle-am-diag-gmm.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" int main(int argc, char *argv[]) { @@ -50,16 +50,12 @@ int main(int argc, char *argv[]) { std::string stats_in_filename = po.GetArg(i); bool binary_read; kaldi::Input ki(stats_in_filename, &binary_read); - transition_accs.Read(ki.Stream(), binary_read, true /*add read values*/); gmm_accs.Read(ki.Stream(), binary_read, true /*add read values*/); } // Write out the accs - { - kaldi::Output ko(stats_out_filename, binary); - transition_accs.Write(ko.Stream(), binary); - gmm_accs.Write(ko.Stream(), binary); - } + WriteKaldiObject(gmm_accs, stats_out_filename, binary); + KALDI_LOG << "Summed " << num_accs << " stats, total count " << gmm_accs.TotCount() << ", avg like/frame " << (gmm_accs.TotLogLike() / gmm_accs.TotCount()); @@ -70,5 +66,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/gmmbin/gmm-transform-means-global.cc b/src/gmmbin/gmm-transform-means-global.cc index 6b1a6be8330..857b602c19b 100644 --- a/src/gmmbin/gmm-transform-means-global.cc +++ b/src/gmmbin/gmm-transform-means-global.cc @@ -22,7 +22,7 @@ #include "util/common-utils.h" #include "gmm/diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/mllt.h" int main(int argc, char *argv[]) { diff --git a/src/gmmbin/gmm-transform-means.cc b/src/gmmbin/gmm-transform-means.cc index 5c08ec32b10..3a27d73a947 100644 --- a/src/gmmbin/gmm-transform-means.cc +++ b/src/gmmbin/gmm-transform-means.cc @@ -22,7 +22,7 @@ #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "tree/context-dep.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "transform/mllt.h" int main(int argc, char *argv[]) { @@ -55,7 +55,7 @@ int main(int argc, char *argv[]) { ReadKaldiObject(mat_rxfilename, &mat); AmDiagGmm am_gmm; - TransitionModel trans_model; + Transitions trans_model; { bool binary_read; Input ki(model_in_rxfilename, &binary_read); diff --git a/src/gst-plugin/gst-online-gmm-decode-faster.cc b/src/gst-plugin/gst-online-gmm-decode-faster.cc index 958bce41d80..094d398960a 100644 --- a/src/gst-plugin/gst-online-gmm-decode-faster.cc +++ b/src/gst-plugin/gst-online-gmm-decode-faster.cc @@ -389,7 +389,7 @@ gst_online_gmm_decode_faster_allocate(GstOnlineGmmDecodeFaster * filter) { Input ki(filter->lda_mat_rspecifier_, &binary_in); filter->lda_transform_->Read(ki.Stream(), binary_in); } - filter->trans_model_ = new TransitionModel(); + filter->trans_model_ = new Transitions(); filter->am_gmm_ = new AmDiagGmm(); { bool binary; diff --git a/src/gst-plugin/gst-online-gmm-decode-faster.h b/src/gst-plugin/gst-online-gmm-decode-faster.h index b950d1e0a12..529c510115a 100644 --- a/src/gst-plugin/gst-online-gmm-decode-faster.h +++ b/src/gst-plugin/gst-online-gmm-decode-faster.h @@ -65,7 +65,7 @@ struct _GstOnlineGmmDecodeFaster { OnlineFasterDecoder *decoder_; Matrix *lda_transform_; - TransitionModel *trans_model_; + Transitions 
*trans_model_; AmDiagGmm *am_gmm_; fst::Fst *decode_fst_; fst::SymbolTable *word_syms_; diff --git a/src/hmm/Makefile b/src/hmm/Makefile index 0ad5da74c28..fb8c57397c8 100644 --- a/src/hmm/Makefile +++ b/src/hmm/Makefile @@ -3,14 +3,13 @@ all: include ../kaldi.mk -TESTFILES = hmm-topology-test hmm-utils-test transition-model-test posterior-test +TESTFILES = topology-test hmm-utils-test transitions-test posterior-test -OBJFILES = hmm-topology.o transition-model.o hmm-utils.o tree-accu.o \ +OBJFILES = topology.o transitions.o hmm-utils.o tree-accu.o \ posterior.o hmm-test-utils.o LIBNAME = kaldi-hmm -ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a +ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk - diff --git a/src/hmm/hmm-test-utils.cc b/src/hmm/hmm-test-utils.cc index ceca116c828..6eae1a119b2 100644 --- a/src/hmm/hmm-test-utils.cc +++ b/src/hmm/hmm-test-utils.cc @@ -23,7 +23,7 @@ namespace kaldi { -TransitionModel *GenRandTransitionModel(ContextDependency **ctx_dep_out) { +Transitions *GenRandTransitions(ContextDependency **ctx_dep_out) { std::vector phones; phones.push_back(1); for (int32 i = 2; i < 20; i++) @@ -38,16 +38,16 @@ TransitionModel *GenRandTransitionModel(ContextDependency **ctx_dep_out) { GenRandContextDependencyLarge(phones, N, P, true, &num_pdf_classes); - HmmTopology topo = GenRandTopology(phones, num_pdf_classes); + Topology topo = GenRandTopology(phones, num_pdf_classes); - TransitionModel *trans_model = new TransitionModel(*ctx_dep, topo); + Transitions *trans_model = new Transitions(*ctx_dep, topo); if (ctx_dep_out == NULL) delete ctx_dep; else *ctx_dep_out = ctx_dep; return trans_model; } -HmmTopology GetDefaultTopology(const std::vector &phones_in) { +Topology GetDefaultTopology(const std::vector &phones_in) { std::vector phones(phones_in); std::sort(phones.begin(), phones.end()); KALDI_ASSERT(IsSortedAndUniq(phones) && !phones.empty()); @@ -59,24 +59,19 @@ HmmTopology GetDefaultTopology(const std::vector &phones_in) { for (size_t i = 0; i < phones.size(); i++) topo_string << phones[i] << " "; - topo_string << "\n" - " 0 0\n" - " 0 0.5\n" - " 1 0.5\n" - " \n" - " 1 1 \n" - " 1 0.5\n" - " 2 0.5\n" - " \n" - " 2 2\n" - " 2 0.5\n" - " 3 0.5\n" - " \n" - " 3 \n" - " \n" - " \n"; - - HmmTopology topo; + topo_string << + "\n" + "0 1 1 0.0\n" + "1 1 1 0.693\n" + "1 2 2 0.693\n" + "2 2 2 0.693\n" + "2 3 3 0.693\n" + "3 3 3 0.693\n" + "3 0.693\n\n" + "\n" + "\n"; + + Topology topo; std::istringstream iss(topo_string.str()); topo.Read(iss, false); return topo; @@ -84,15 +79,15 @@ HmmTopology GetDefaultTopology(const std::vector &phones_in) { } -HmmTopology GenRandTopology(const std::vector &phones_in, - const std::vector &num_pdf_classes) { +Topology GenRandTopology(const std::vector &phones_in, + const std::vector &num_pdf_classes) { std::vector phones(phones_in); std::sort(phones.begin(), phones.end()); KALDI_ASSERT(IsSortedAndUniq(phones) && !phones.empty()); std::ostringstream topo_string; - std::map > num_pdf_classes_to_phones; + std::map > num_pdf_classes_to_phones; for (size_t i = 0; i < phones.size(); i++) { int32 p = phones[i]; KALDI_ASSERT(static_cast(p) < num_pdf_classes.size()); @@ -112,66 +107,43 @@ HmmTopology GenRandTopology(const std::vector &phones_in, const std::vector &phones = iter->second; for (size_t i = 0; i < phones.size(); i++) topo_string << phones[i] << " "; - topo_string << " "; 
- bool ergodic = (RandInt(0, 1) == 0); - if (ergodic) { - // Note, this type of topology is not something we ever use in practice- it - // has an initial nonemitting state (no PdfClass specified). But it's - // supported so we're testing it. - std::vector state_to_pdf_class; - state_to_pdf_class.push_back(-1); // state zero, nonemitting. - for (int32 i = 0; i < this_num_pdf_classes; i++) { - int32 num_states = RandInt(1, 2); - for (int32 j = 0; j < num_states; j++) - state_to_pdf_class.push_back(i); - } - state_to_pdf_class.push_back(-1); // final non-emitting state. - { // state zero is nonemitting. This is not something used in any current - // example script. - topo_string << " 0\n"; - BaseFloat prob = 1.0 / (state_to_pdf_class.size() - 2); - for (size_t i = 1; i + 1 < state_to_pdf_class.size(); i++) { - topo_string << " " << i << ' ' << prob << '\n'; - } - topo_string << "\n"; - } - // ergodic part. - for (size_t i = 1; i + 1 < state_to_pdf_class.size(); i++) { - BaseFloat prob = 1.0 / (state_to_pdf_class.size() - 1); - topo_string << " " << i << " " - << state_to_pdf_class[i] << '\n'; - for (size_t j = 1; j < state_to_pdf_class.size(); j++) - topo_string << " " << j << ' ' << prob << '\n'; - topo_string << "\n"; - } - // final, nonemitting state. No pdf-class, no transitions. - topo_string << " " << (state_to_pdf_class.size() - 1) << " \n"; - } else { - // feedforward topology. - int32 cur_state = 0; - for (int32 pdf_class = 0; pdf_class < this_num_pdf_classes; pdf_class++) { - int32 this_num_states = RandInt(1, 2); - for (int32 s = 0; s < this_num_states; s++) { - topo_string << " " << cur_state << " " << pdf_class - << "\n " << cur_state << " 0.5\n " - << (cur_state + 1) << " 0.5\n\n"; - cur_state++; - } - } - // final, non-emitting state. - topo_string << " " << cur_state << " \n"; + topo_string << "\n"; + + switch (this_num_pdf_classes) { + case 1: + topo_string << "0 1 1 0.0\n" + "1 1 1 0.693\n" + "1 0.693\n\n"; + break; + case 2: + topo_string << "0 1 1 0.0\n" + "1 1 1 0.693\n" + "1 2 2 0.693\n" + "2 2 2 0.693\n" + "2 0.693\n\n"; + break; + case 3: + topo_string << "0 1 1 0.0\n" + "1 1 1 0.693\n" + "1 2 2 0.693\n" + "2 3 3 0.0\n" // mix it up a bit. + "3 3 3 0.693\n" + "3 0.693\n\n"; + break; + default: + KALDI_ERR << "Un-handled num-pdf-classes\n"; } topo_string << "\n"; } topo_string << "\n"; - HmmTopology topo; + Topology topo; std::istringstream iss(topo_string.str()); topo.Read(iss, false); return topo; } -HmmTopology GenRandTopology() { +Topology GenRandTopology() { std::vector phones; phones.push_back(1); for (int32 i = 2; i < 20; i++) @@ -182,63 +154,54 @@ HmmTopology GenRandTopology() { } else { std::vector num_pdf_classes(phones.back() + 1, -1); for (int32 i = 0; i < phones.size(); i++) - num_pdf_classes[phones[i]] = RandInt(1, 5); + num_pdf_classes[phones[i]] = RandInt(1, 3); return GenRandTopology(phones, num_pdf_classes); } } -void GeneratePathThroughHmm(const HmmTopology &topology, - bool reorder, +void GeneratePathThroughHmm(const Topology &topology, int32 phone, std::vector > *path) { path->clear(); - const HmmTopology::TopologyEntry &this_entry = - topology.TopologyForPhone(phone); + auto const &this_entry = topology.TopologyForPhone(phone); // an FST int32 cur_state = 0; // start-state is always state zero. - int32 num_states = this_entry.size(), final_state = num_states - 1; + + // Note: final_state == num_states - 1 is actually not something + // that would be generally true, but it is true for the topologies we + // use in the test code. 
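Note: the numeric costs in the new-style topology strings above (see the switch on this_num_pdf_classes) appear to be negated natural-log probabilities rather than the raw 0.5-style probabilities of the old format: 0.693 is -log(0.5) and 0.0 is -log(1.0). A quick self-contained check, not part of the patch:

  #include <cmath>
  #include <cstdio>

  int main() {
    // Matches the 0.693 / 0.0 costs used in the test topology strings.
    std::printf("-log(0.5) = %.3f\n", -std::log(0.5));  // prints 0.693
    std::printf("-log(1.0) = %.3f\n", -std::log(1.0));  // prints 0.000
    return 0;
  }
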
+ int32 num_states = this_entry.NumStates(), final_state = num_states - 1; KALDI_ASSERT(num_states > 1); // there has to be a final nonemitting state // that's different from the start state. - std::vector > pending_self_loops; + while (cur_state != final_state) { - const HmmTopology::HmmState &cur_hmm_state = this_entry[cur_state]; - int32 num_transitions = cur_hmm_state.transitions.size(), - transition_index = RandInt(0, num_transitions - 1); - if (cur_hmm_state.forward_pdf_class != -1) { - std::pair pr(cur_state, transition_index); - if (!reorder) { - path->push_back(pr); - } else { - bool is_self_loop = (cur_state == - cur_hmm_state.transitions[transition_index].first); - if (is_self_loop) { // save these up, we'll put them after the forward - // transition. - pending_self_loops.push_back(pr); - } else { - // non-self-loop: output it and then flush out any self-loops we - // stored up. - path->push_back(pr); - path->insert(path->end(), pending_self_loops.begin(), - pending_self_loops.end()); - pending_self_loops.clear(); - } - } - } - cur_state = cur_hmm_state.transitions[transition_index].first; + int32 num_transitions = this_entry.NumArcs(cur_state), + arc_index = RandInt(0, num_transitions - 1); + fst::ArcIterator aiter(this_entry, cur_state); + aiter.Seek(arc_index); + auto const &arc(aiter.Value()); + KALDI_ASSERT(arc.ilabel > 0); + std::pair pr(cur_state, arc_index); + path->push_back(pr); + cur_state = arc.nextstate; } - KALDI_ASSERT(pending_self_loops.empty()); } void GenerateRandomAlignment(const ContextDependencyInterface &ctx_dep, - const TransitionModel &trans_model, - bool reorder, + const Transitions &trans_model, const std::vector &phone_sequence, std::vector *alignment) { int32 context_width = ctx_dep.ContextWidth(), central_position = ctx_dep.CentralPosition(), num_phones = phone_sequence.size(); + + auto all_phones = trans_model.GetPhones(); + int32 model_max_phone = *std::max_element(all_phones.begin(), + all_phones.end()); alignment->clear(); for (int32 i = 0; i < num_phones; i++) { + KALDI_ASSERT(phone_sequence[i] > 0 + && phone_sequence[i] <= model_max_phone); std::vector context_window; context_window.reserve(context_width); for (int32 j = i - central_position; @@ -248,26 +211,35 @@ void GenerateRandomAlignment(const ContextDependencyInterface &ctx_dep, else context_window.push_back(0); // zero for out-of-window phones } // 'path' is the path through this phone's HMM, represented as - // (emitting-HMM-state, transition-index) pairs + // (source-HMM-state, transition-index) pairs std::vector > path; int32 phone = phone_sequence[i]; - GeneratePathThroughHmm(trans_model.GetTopo(), reorder, phone, &path); + GeneratePathThroughHmm(trans_model.GetTopo(), phone, &path); for (size_t k = 0; k < path.size(); k++) { - const HmmTopology::TopologyEntry &entry = - trans_model.GetTopo().TopologyForPhone(phone); + auto const &entry = trans_model.GetTopo().TopologyForPhone(phone); int32 hmm_state = path[k].first, - transition_index = path[k].second, - forward_pdf_class = entry[hmm_state].forward_pdf_class, - self_loop_pdf_class = entry[hmm_state].self_loop_pdf_class, + arc_index = path[k].second, forward_pdf_id, self_loop_pdf_id; + fst::ArcIterator aiter(entry, hmm_state); + aiter.Seek(arc_index); + auto const &arc(aiter.Value()); + int32 forward_pdf_class = arc.ilabel, + self_loop_pdf_class = -1; + for (fst::ArcIterator aiter_next(entry, arc.nextstate); + !aiter_next.Done(); aiter_next.Next()) + if (aiter_next.Value().nextstate == arc.nextstate) + self_loop_pdf_class = 
aiter_next.Value().ilabel; + bool ans = ctx_dep.Compute(context_window, forward_pdf_class, &forward_pdf_id); KALDI_ASSERT(ans && "context-dependency computation failed."); - ans = ctx_dep.Compute(context_window, self_loop_pdf_class, &self_loop_pdf_id); - KALDI_ASSERT(ans && "context-dependency computation failed."); - int32 transition_state = trans_model.TupleToTransitionState( - phone, hmm_state, forward_pdf_id, self_loop_pdf_id), - transition_id = trans_model.PairToTransitionId(transition_state, - transition_index); + if (self_loop_pdf_class != -1) { + ans = ctx_dep.Compute(context_window, self_loop_pdf_class, &self_loop_pdf_id); + KALDI_ASSERT(ans && "context-dependency computation failed."); + } else { + self_loop_pdf_id = -1; + } + int32 transition_id = trans_model.TupleToTransitionId(phone, hmm_state, arc_index, + forward_pdf_id, self_loop_pdf_id); alignment->push_back(transition_id); } } diff --git a/src/hmm/hmm-test-utils.h b/src/hmm/hmm-test-utils.h index 4faaa92fa66..32c901c1791 100644 --- a/src/hmm/hmm-test-utils.h +++ b/src/hmm/hmm-test-utils.h @@ -21,38 +21,38 @@ #ifndef KALDI_HMM_HMM_TEST_UTILS_H_ #define KALDI_HMM_HMM_TEST_UTILS_H_ -#include "hmm/hmm-topology.h" -#include "hmm/transition-model.h" +#include "hmm/topology.h" +#include "hmm/transitions.h" #include "lat/kaldi-lattice.h" #include "tree/context-dep.h" namespace kaldi { -// Here we put a convenience function for generating a TransitionModel object -- +// Here we put a convenience function for generating a Transitions object -- // useful in test code. We may put other testing-related things here in time. -// This function returns a randomly generated TransitionModel object. +// This function returns a randomly generated Transitions object. // If 'ctx_dep' is not NULL, it outputs to *ctx_dep a pointer to the // tree that was used to generate the transition model. -TransitionModel *GenRandTransitionModel(ContextDependency **ctx_dep); +Transitions *GenRandTransitions(ContextDependency **ctx_dep); -/// This function returns a HmmTopology object giving a normal 3-state topology, +/// This function returns a Topology object giving a normal 3-state topology, /// covering all phones in the list "phones". This is mainly of use in testing /// code. -HmmTopology GetDefaultTopology(const std::vector &phones); +Topology GetDefaultTopology(const std::vector &phones); -/// This method of generating an arbitrary HmmTopology object allows you to +/// This method of generating an arbitrary Topology object allows you to /// specify the number of pdf-classes for each phone separately. /// 'num_pdf_classes' is indexed by the phone-index (so the length will be /// longer than the length of the 'phones' vector, which for example lacks the /// zero index and may have gaps). -HmmTopology GenRandTopology(const std::vector &phones, +Topology GenRandTopology(const std::vector &phones, const std::vector &num_pdf_classes); /// This version of GenRandTopology() generates the phone list and number of pdf /// classes randomly. -HmmTopology GenRandTopology(); +Topology GenRandTopology(); /// This function generates a random path through the HMM for the given /// phone. The 'path' output is a list of pairs (HMM-state, transition-index) @@ -60,8 +60,7 @@ HmmTopology GenRandTopology(); /// used in other test code. /// the 'reorder' option is as described in the documentation; if true, the /// self-loops from a state are reordered to come after the forward-transition. 
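Note: in the rewritten test helpers above, each phone's topology entry is walked as an FST: arcs are enumerated with fst::ArcIterator, the arc ilabel supplies the pdf-class, and a single TupleToTransitionId call replaces the old TupleToTransitionState / PairToTransitionId pair. A standalone sketch of the arc-walking idiom using plain OpenFst types (the actual TopologyEntry type in this patch may differ; RandomPath is illustrative only):

  #include <cstdlib>
  #include <utility>
  #include <vector>
  #include <fst/vector-fst.h>

  // Follow randomly chosen arcs from the start state until a final state is
  // reached, recording (state, arc-index) pairs as GeneratePathThroughHmm does.
  std::vector<std::pair<int, int> > RandomPath(const fst::StdVectorFst &entry) {
    std::vector<std::pair<int, int> > path;
    int cur_state = entry.Start();
    while (entry.Final(cur_state) == fst::TropicalWeight::Zero()) {
      int num_arcs = entry.NumArcs(cur_state);
      int arc_index = std::rand() % num_arcs;  // kaldi::RandInt in the real code.
      fst::ArcIterator<fst::StdVectorFst> aiter(entry, cur_state);
      aiter.Seek(arc_index);
      path.push_back(std::make_pair(cur_state, arc_index));
      cur_state = aiter.Value().nextstate;
    }
    return path;
  }
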
-void GeneratePathThroughHmm(const HmmTopology &topology, - bool reorder, +void GeneratePathThroughHmm(const Topology &topology, int32 phone, std::vector > *path); @@ -69,8 +68,7 @@ void GeneratePathThroughHmm(const HmmTopology &topology, /// For use in test code, this function generates an alignment (a sequence of /// transition-ids) corresponding to a given phone sequence. void GenerateRandomAlignment(const ContextDependencyInterface &ctx_dep, - const TransitionModel &trans_model, - bool reorder, + const Transitions &trans_model, const std::vector &phone_sequence, std::vector *alignment); diff --git a/src/hmm/hmm-topology.cc b/src/hmm/hmm-topology.cc deleted file mode 100644 index 29634ecda0b..00000000000 --- a/src/hmm/hmm-topology.cc +++ /dev/null @@ -1,387 +0,0 @@ -// hmm/hmm-topology.cc - -// Copyright 2009-2011 Microsoft Corporation -// 2014 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "hmm/hmm-topology.h" -#include "util/text-utils.h" - - -namespace kaldi { - - - -void HmmTopology::GetPhoneToNumPdfClasses(std::vector *phone2num_pdf_classes) const { - KALDI_ASSERT(!phones_.empty()); - phone2num_pdf_classes->clear(); - phone2num_pdf_classes->resize(phones_.back() + 1, -1); - for (size_t i = 0; i < phones_.size(); i++) - (*phone2num_pdf_classes)[phones_[i]] = NumPdfClasses(phones_[i]); -} - -void HmmTopology::Read(std::istream &is, bool binary) { - ExpectToken(is, binary, ""); - if (!binary) { // Text-mode read, different "human-readable" format. - phones_.clear(); - phone2idx_.clear(); - entries_.clear(); - std::string token; - while ( ! (is >> token).fail() ) { - if (token == "") { break; } // finished parsing. - else if (token != "") { - KALDI_ERR << "Reading HmmTopology object, expected or , got "<"); - std::vector phones; - std::string s; - while (1) { - is >> s; - if (is.fail()) KALDI_ERR << "Reading HmmTopology object, unexpected end of file while expecting phones."; - if (s == "") break; - else { - int32 phone; - if (!ConvertStringToInteger(s, &phone)) - KALDI_ERR << "Reading HmmTopology object, expected " - << "integer, got instead " << s; - phones.push_back(phone); - } - } - - std::vector this_entry; - std::string token; - ReadToken(is, binary, &token); - while (token != "") { - if (token != "") - KALDI_ERR << "Expected or , got instead " << token; - int32 state; - ReadBasicType(is, binary, &state); - if (state != static_cast(this_entry.size())) - KALDI_ERR << "States are expected to be in order from zero, expected " - << this_entry.size() << ", got " << state; - ReadToken(is, binary, &token); - int32 forward_pdf_class = kNoPdf; // -1 by default, means no pdf. 
- if (token == "") { - ReadBasicType(is, binary, &forward_pdf_class); - this_entry.push_back(HmmState(forward_pdf_class)); - ReadToken(is, binary, &token); - if (token == "") - KALDI_ERR << "pdf classes should be defined using " - << "or / pair"; - } else if (token == "") { - int32 self_loop_pdf_class = kNoPdf; - ReadBasicType(is, binary, &forward_pdf_class); - ReadToken(is, binary, &token); - if (token != "") - KALDI_ERR << "Expected , got instead " << token; - ReadBasicType(is, binary, &self_loop_pdf_class); - this_entry.push_back(HmmState(forward_pdf_class, self_loop_pdf_class)); - ReadToken(is, binary, &token); - } else - this_entry.push_back(HmmState(forward_pdf_class)); - while (token == "") { - int32 dst_state; - BaseFloat trans_prob; - ReadBasicType(is, binary, &dst_state); - ReadBasicType(is, binary, &trans_prob); - this_entry.back().transitions.push_back(std::make_pair(dst_state, trans_prob)); - ReadToken(is, binary, &token); - } - if (token == "") // TODO: remove this clause after a while. - KALDI_ERR << "You are trying to read old-format topology with new Kaldi."; - if (token != "") - KALDI_ERR << "Expected , got instead " << token; - ReadToken(is, binary, &token); - } - int32 my_index = entries_.size(); - entries_.push_back(this_entry); - - for (size_t i = 0; i < phones.size(); i++) { - int32 phone = phones[i]; - if (static_cast(phone2idx_.size()) <= phone) - phone2idx_.resize(phone+1, -1); // -1 is invalid index. - KALDI_ASSERT(phone > 0); - if (phone2idx_[phone] != -1) - KALDI_ERR << "Phone with index "<<(i)<<" appears in multiple topology entries."; - phone2idx_[phone] = my_index; - phones_.push_back(phone); - } - } - } - std::sort(phones_.begin(), phones_.end()); - KALDI_ASSERT(IsSortedAndUniq(phones_)); - } else { // binary I/O, just read member objects directly from disk. - ReadIntegerVector(is, binary, &phones_); - ReadIntegerVector(is, binary, &phone2idx_); - int32 sz; - ReadBasicType(is, binary, &sz); - bool is_hmm = true; - if (sz == -1) { - is_hmm = false; - ReadBasicType(is, binary, &sz); - } - entries_.resize(sz); - for (int32 i = 0; i < sz; i++) { - int32 thist_sz; - ReadBasicType(is, binary, &thist_sz); - entries_[i].resize(thist_sz); - for (int32 j = 0 ; j < thist_sz; j++) { - ReadBasicType(is, binary, &(entries_[i][j].forward_pdf_class)); - if (is_hmm) - entries_[i][j].self_loop_pdf_class = entries_[i][j].forward_pdf_class; - else - ReadBasicType(is, binary, &(entries_[i][j].self_loop_pdf_class)); - int32 thiss_sz; - ReadBasicType(is, binary, &thiss_sz); - entries_[i][j].transitions.resize(thiss_sz); - for (int32 k = 0; k < thiss_sz; k++) { - ReadBasicType(is, binary, &(entries_[i][j].transitions[k].first)); - ReadBasicType(is, binary, &(entries_[i][j].transitions[k].second)); - } - } - } - ExpectToken(is, binary, ""); - } - Check(); // Will throw if not ok. -} - - -void HmmTopology::Write(std::ostream &os, bool binary) const { - bool is_hmm = IsHmm(); - WriteToken(os, binary, ""); - if (!binary) { // Text-mode write. 
- os << "\n"; - for (int32 i = 0; i < static_cast (entries_.size()); i++) { - WriteToken(os, binary, ""); - os << "\n"; - WriteToken(os, binary, ""); - os << "\n"; - for (size_t j = 0; j < phone2idx_.size(); j++) { - if (phone2idx_[j] == i) - os << j << " "; - } - os << "\n"; - WriteToken(os, binary, ""); - os << "\n"; - for (size_t j = 0; j < entries_[i].size(); j++) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, static_cast(j)); - if (entries_[i][j].forward_pdf_class != kNoPdf) { - if (is_hmm) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, entries_[i][j].forward_pdf_class); - } else { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, entries_[i][j].forward_pdf_class); - KALDI_ASSERT(entries_[i][j].self_loop_pdf_class != kNoPdf); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, entries_[i][j].self_loop_pdf_class); - } - } - for (size_t k = 0; k < entries_[i][j].transitions.size(); k++) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, entries_[i][j].transitions[k].first); - WriteBasicType(os, binary, entries_[i][j].transitions[k].second); - } - WriteToken(os, binary, ""); - os << "\n"; - } - WriteToken(os, binary, ""); - os << "\n"; - } - } else { - WriteIntegerVector(os, binary, phones_); - WriteIntegerVector(os, binary, phone2idx_); - // -1 is put here as a signal that the object has the new, - // extended format with SelfLoopPdfClass - if (!is_hmm) WriteBasicType(os, binary, static_cast(-1)); - WriteBasicType(os, binary, static_cast(entries_.size())); - for (size_t i = 0; i < entries_.size(); i++) { - WriteBasicType(os, binary, static_cast(entries_[i].size())); - for (size_t j = 0; j < entries_[i].size(); j++) { - WriteBasicType(os, binary, entries_[i][j].forward_pdf_class); - if (!is_hmm) WriteBasicType(os, binary, entries_[i][j].self_loop_pdf_class); - WriteBasicType(os, binary, static_cast(entries_[i][j].transitions.size())); - for (size_t k = 0; k < entries_[i][j].transitions.size(); k++) { - WriteBasicType(os, binary, entries_[i][j].transitions[k].first); - WriteBasicType(os, binary, entries_[i][j].transitions[k].second); - } - } - } - } - WriteToken(os, binary, ""); - if (!binary) os << "\n"; -} - -void HmmTopology::Check() { - if (entries_.empty() || phones_.empty() || phone2idx_.empty()) - KALDI_ERR << "HmmTopology::Check(), empty object."; - std::vector is_seen(entries_.size(), false); - for (size_t i = 0; i < phones_.size(); i++) { - int32 phone = phones_[i]; - if (static_cast(phone) >= phone2idx_.size() || - static_cast(phone2idx_[phone]) >= entries_.size()) - KALDI_ERR << "HmmTopology::Check(), phone has no valid index."; - is_seen[phone2idx_[phone]] = true; - } - for (size_t i = 0; i < entries_.size(); i++) { - if (!is_seen[i]) - KALDI_ERR << "HmmTopoloy::Check(), entry with no corresponding phones."; - int32 num_states = static_cast(entries_[i].size()); - if (num_states <= 1) - KALDI_ERR << "HmmTopology::Check(), cannot only have one state (i.e., must " - "have at least one emitting state)."; - if (!entries_[i][num_states-1].transitions.empty()) - KALDI_ERR << "HmmTopology::Check(), last state must have no transitions."; - // not sure how necessary this next stipulation is. - if (entries_[i][num_states-1].forward_pdf_class != kNoPdf) - KALDI_ERR << "HmmTopology::Check(), last state must not be emitting."; - - std::vector has_trans_in(num_states, false); - std::vector seen_pdf_classes; - - for (int32 j = 0; j < num_states; j++) { // j is the state-id. 
- BaseFloat tot_prob = 0.0; - if (entries_[i][j].forward_pdf_class != kNoPdf) { - seen_pdf_classes.push_back(entries_[i][j].forward_pdf_class); - seen_pdf_classes.push_back(entries_[i][j].self_loop_pdf_class); - } - std::set seen_transition; - for (int32 k = 0; - static_cast(k) < entries_[i][j].transitions.size(); - k++) { - tot_prob += entries_[i][j].transitions[k].second; - if (entries_[i][j].transitions[k].second <= 0.0) - KALDI_ERR << "HmmTopology::Check(), negative or zero transition prob."; - int32 dst_state = entries_[i][j].transitions[k].first; - // The commented code in the next few lines disallows a completely - // skippable phone, as this would cause to stop working some mechanisms - // that are being built, which enable the creation of phone-level lattices - // and rescoring these with a different lexicon and LM. - if (dst_state == num_states-1 // && j != 0 - && entries_[i][j].forward_pdf_class == kNoPdf) - KALDI_ERR << "We do not allow any state to be " - "nonemitting and have a transition to the final-state (this would " - "stop the SplitToPhones function from identifying the last state " - "of a phone."; - if (dst_state < 0 || dst_state >= num_states) - KALDI_ERR << "HmmTopology::Check(), invalid dest state " << (dst_state); - if (seen_transition.count(dst_state) != 0) - KALDI_ERR << "HmmTopology::Check(), duplicate transition found."; - if (dst_state == k) { // self_loop... - KALDI_ASSERT(entries_[i][j].self_loop_pdf_class != kNoPdf && - "Nonemitting states cannot have self-loops."); - } - seen_transition.insert(dst_state); - has_trans_in[dst_state] = true; - } - if (j+1 < num_states) { - KALDI_ASSERT(tot_prob > 0.0 && "Non-final state must have transitions out." - "(with nonzero probability)"); - if (fabs(tot_prob - 1.0) > 0.01) - KALDI_WARN << "Total probability for state " << j << - " in topology entry is " << tot_prob; - } else - KALDI_ASSERT(tot_prob == 0.0); - } - // make sure all but start state have input transitions. - for (int32 j = 1; j < num_states; j++) - if (!has_trans_in[j]) - KALDI_ERR << "HmmTopology::Check, state "<<(j)<<" has no input transitions."; - SortAndUniq(&seen_pdf_classes); - if (seen_pdf_classes.front() != 0 || - seen_pdf_classes.back() != static_cast(seen_pdf_classes.size()) - 1) { - KALDI_ERR << "HmmTopology::Check(), pdf_classes are expected to be " - "contiguous and start from zero."; - } - } -} - -bool HmmTopology::IsHmm() const { - const std::vector &phones = GetPhones(); - KALDI_ASSERT(!phones.empty()); - for (size_t i = 0; i < phones.size(); i++) { - int32 phone = phones[i]; - const TopologyEntry &entry = TopologyForPhone(phone); - for (int32 j = 0; j < static_cast(entry.size()); j++) { // for each state... - int32 forward_pdf_class = entry[j].forward_pdf_class, - self_loop_pdf_class = entry[j].self_loop_pdf_class; - if (forward_pdf_class != self_loop_pdf_class) - return false; - } - } - return true; -} - -const HmmTopology::TopologyEntry& HmmTopology::TopologyForPhone(int32 phone) const { // Will throw if phone not covered. - if (static_cast(phone) >= phone2idx_.size() || phone2idx_[phone] == -1) { - KALDI_ERR << "TopologyForPhone(), phone "<<(phone)<<" not covered."; - } - return entries_[phone2idx_[phone]]; -} - -int32 HmmTopology::NumPdfClasses(int32 phone) const { - // will throw if phone not covered. 
-const HmmTopology::TopologyEntry& HmmTopology::TopologyForPhone(int32 phone) const {  // Will throw if phone not covered.
-  if (static_cast<size_t>(phone) >= phone2idx_.size() || phone2idx_[phone] == -1) {
-    KALDI_ERR << "TopologyForPhone(), phone "<<(phone)<<" not covered.";
-  }
-  return entries_[phone2idx_[phone]];
-}
-
-int32 HmmTopology::NumPdfClasses(int32 phone) const {
-  // will throw if phone not covered.
-  const TopologyEntry &entry = TopologyForPhone(phone);
-  int32 max_pdf_class = 0;
-  for (size_t i = 0; i < entry.size(); i++) {
-    max_pdf_class = std::max(max_pdf_class, entry[i].forward_pdf_class);
-    max_pdf_class = std::max(max_pdf_class, entry[i].self_loop_pdf_class);
-  }
-  return max_pdf_class+1;
-}
-
-int32 HmmTopology::MinLength(int32 phone) const {
-  const TopologyEntry &entry = TopologyForPhone(phone);
-  // min_length[state] gives the minimum length for sequences up to and
-  // including that state.
-  std::vector<int32> min_length(entry.size(),
-                                std::numeric_limits<int32>::max());
-  KALDI_ASSERT(!entry.empty());
-
-  min_length[0] = (entry[0].forward_pdf_class == -1 ? 0 : 1);
-  int32 num_states = min_length.size();
-  bool changed = true;
-  while (changed) {
-    changed = false;
-    for (int32 s = 0; s < num_states; s++) {
-      const HmmState &this_state = entry[s];
-      std::vector<std::pair<int32, BaseFloat> >::const_iterator
-          iter = this_state.transitions.begin(),
-          end = this_state.transitions.end();
-      for (; iter != end; ++iter) {
-        int32 next_state = iter->first;
-        KALDI_ASSERT(next_state < num_states);
-        int32 next_state_min_length = min_length[s] +
-            (entry[next_state].forward_pdf_class == -1 ? 0 : 1);
-        if (next_state_min_length < min_length[next_state]) {
-          min_length[next_state] = next_state_min_length;
-          if (next_state < s)
-            changed = true;
-          // the test of 'next_state < s' is an optimization for speed.
-        }
-      }
-    }
-  }
-  KALDI_ASSERT(min_length.back() != std::numeric_limits<int32>::max());
-  // the last state is the final-state.
-  return min_length.back();
-}
-
-} // End namespace kaldi
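MinLength() is a small fixed-point computation: min_length[s] is the minimum number of emitting states on any path from the start state up to and including s, and the sweep is repeated only when an earlier-numbered state improves (the 'next_state < s' test), which can only happen if a topology has backward transitions. For the standard 3-emitting-state topology shown in the header below, every emitting state is entered at least once, so the expected values are (illustrative sketch only, assuming a topology object `topo` whose entry for phone 1 is that standard one):

    KALDI_ASSERT(topo.MinLength(1) == 3);      // states 0, 1, 2 each emit at least one frame.
    KALDI_ASSERT(topo.NumPdfClasses(1) == 3);  // pdf-classes 0, 1 and 2.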
diff --git a/src/hmm/hmm-topology.h b/src/hmm/hmm-topology.h
deleted file mode 100644
index 750d35bcfe4..00000000000
--- a/src/hmm/hmm-topology.h
+++ /dev/null
@@ -1,194 +0,0 @@
-// hmm/hmm-topology.h
-
-// Copyright 2009-2011  Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_HMM_HMM_TOPOLOGY_H_
-#define KALDI_HMM_HMM_TOPOLOGY_H_
-
-#include "base/kaldi-common.h"
-#include "util/const-integer-set.h"
-
-
-namespace kaldi {
-
-
-/// \addtogroup hmm_group
-/// @{
-
-/*
-  // The following would be the text form for the "normal" HMM topology.
-  // Note that the first state is the start state, and the final state,
-  // which must have no output transitions and must be nonemitting, has
-  // an exit probability of one (no other state can have nonzero exit
-  // probability; you can treat the transition probability to the final
-  // state as an exit probability).
-  // Note also that it's valid to omit the "<PdfClass>" entry of the <State>, which
-  // will mean we won't have a pdf on that state [non-emitting state].  This is equivalent
-  // to setting the <PdfClass> to -1.  We do this normally just for the final state.
-  // The Topology object can have multiple <TopologyEntry> blocks.
-  // This is useful if there are multiple types of topology in the system.
-
-
-  <Topology>
-  <TopologyEntry>
-  <ForPhones> 1 2 3 4 5 6 7 8 </ForPhones>
-  <State> 0 <PdfClass> 0
-  <Transition> 0 0.5
-  <Transition> 1 0.5
-  </State>
-  <State> 1 <PdfClass> 1
-  <Transition> 1 0.5
-  <Transition> 2 0.5
-  </State>
-  <State> 2 <PdfClass> 2
-  <Transition> 2 0.5
-  <Transition> 3 0.5
-  <Final> 0.5
-  </State>
-  <State> 3
-  </State>
-  </TopologyEntry>
-  </Topology>
-*/
-
-// kNoPdf is used where pdf_class or pdf would be used, to indicate,
-// none is there.  Mainly useful in skippable models, but also used
-// for end states.
-// A caveat with nonemitting states is that their out-transitions
-// are not trainable, due to technical issues with the way
-// we decided to accumulate the stats.  Any transitions arising from (*)
-// HMM states with "kNoPdf" as the label are second-class transitions,
-// They do not have "transition-states" or "transition-ids" associated
-// with them.  They are used to create the FST version of the
-// HMMs, where they lead to epsilon arcs.
-// (*) "arising from" is a bit of a technical term here, due to the way
-// (if reorder == true), we put the transition-id associated with the
-// outward arcs of the state, on the input transition to the state.
-
-/// A constant used in the HmmTopology class as the \ref pdf_class "pdf-class"
-/// kNoPdf, which is used when a HMM-state is nonemitting (has no associated
-/// PDF).
-
-static const int32 kNoPdf = -1;
-
-/// A class for storing topology information for phones.  See \ref hmm for context.
-/// This object is sometimes accessed in a file by itself, but more often
-/// as a class member of the Transition class (this is for convenience to reduce
-/// the number of files programs have to access).
-
-class HmmTopology {
- public:
-  /// A structure defined inside HmmTopology to represent a HMM state.
-  struct HmmState {
-    /// The \ref pdf_class forward-pdf-class, typically 0, 1 or 2 (the same as the HMM-state index),
-    /// but may be different to enable us to hardwire sharing of state, and may be
-    /// equal to \ref kNoPdf == -1 in order to specify nonemitting states (unusual).
-    int32 forward_pdf_class;
-
-    /// The \ref pdf_class self-loop pdf-class, similar to \ref pdf_class forward-pdf-class.
-    /// They will either both be \ref kNoPdf, or neither be \ref kNoPdf.
-    int32 self_loop_pdf_class;
-
-    /// A list of transitions, indexed by what we call a 'transition-index'.
-    /// The first member of each pair is the index of the next HmmState, and the
-    /// second is the default transition probability (before training).
-    std::vector<std::pair<int32, BaseFloat> > transitions;
-
-    explicit HmmState(int32 pdf_class) {
-      this->forward_pdf_class = pdf_class;
-      this->self_loop_pdf_class = pdf_class;
-    }
-    explicit HmmState(int32 forward_pdf_class, int32 self_loop_pdf_class) {
-      KALDI_ASSERT((forward_pdf_class != kNoPdf && self_loop_pdf_class != kNoPdf) ||
-                   (forward_pdf_class == kNoPdf && self_loop_pdf_class == kNoPdf));
-      this->forward_pdf_class = forward_pdf_class;
-      this->self_loop_pdf_class = self_loop_pdf_class;
-    }
-
-    bool operator == (const HmmState &other) const {
-      return (forward_pdf_class == other.forward_pdf_class &&
-              self_loop_pdf_class == other.self_loop_pdf_class &&
-              transitions == other.transitions);
-    }
-
-    HmmState(): forward_pdf_class(-1), self_loop_pdf_class(-1) { }
-  };
-
-  /// TopologyEntry is a typedef that represents the topology of
-  /// a single (prototype) state.
-  typedef std::vector<HmmState> TopologyEntry;
-
-  void Read(std::istream &is, bool binary);
-  void Write(std::ostream &os, bool binary) const;
-
-  // Checks that the object is valid, and throw exception otherwise.
-  void Check();
-
-  /// Returns true if this HmmTopology is really 'hmm-like', i.e. the pdf-class on
-  /// the self-loops and forward transitions of all states are identical. [note: in HMMs,
-  /// the densities are associated with the states.] We have extended this to
-  /// support 'non-hmm-like' topologies (where those pdf-classes are different),
-  /// in order to make for more compact decoding graphs in our so-called 'chain models'
-  /// (AKA lattice-free MMI), where we use 1-state topologies that have different pdf-classes
-  /// for the self-loop and the forward transition.  Note that we always use the 'reorder=true'
-  /// option so the 'forward transition' actually comes before the self-loop.
-  bool IsHmm() const;
-
-  /// Returns the topology entry (i.e. vector of HmmState) for this phone;
-  /// will throw exception if phone not covered by the topology.
-  const TopologyEntry &TopologyForPhone(int32 phone) const;
-
-  /// Returns the number of \ref pdf_class "pdf-classes" for this phone;
-  /// throws exception if phone not covered by this topology.
-  int32 NumPdfClasses(int32 phone) const;
-
-  /// Returns a reference to a sorted, unique list of phones covered by
-  /// the topology (these phones will be positive integers, and usually
-  /// contiguous and starting from one but the toolkit doesn't assume
-  /// they are contiguous).
-  const std::vector<int32> &GetPhones() const { return phones_; };
-
-  /// Outputs a vector of int32, indexed by phone, that gives the
-  /// number of \ref pdf_class pdf-classes for the phones; this is
-  /// used by tree-building code such as BuildTree().
-  void GetPhoneToNumPdfClasses(std::vector<int32> *phone2num_pdf_classes) const;
-
-  // Returns the minimum number of frames it takes to traverse this model for
-  // this phone: e.g. 3 for the normal HMM topology.
-  int32 MinLength(int32 phone) const;
-
-  HmmTopology() {}
-
-  bool operator == (const HmmTopology &other) const {
-    return phones_ == other.phones_ && phone2idx_ == other.phone2idx_
-        && entries_ == other.entries_;
-  }
-  // Allow default assignment operator and copy constructor.
- private:
-  std::vector<int32> phones_;  // list of all phones we have topology for.  Sorted, uniq.  no epsilon (zero) phone.
-  std::vector<int32> phone2idx_;  // map from phones to indexes into the entries vector (or -1 for not present).
- std::vector entries_; -}; - - -/// @} end "addtogroup hmm_group" - - -} // end namespace kaldi - - -#endif diff --git a/src/hmm/hmm-utils-test.cc b/src/hmm/hmm-utils-test.cc index 69728cc8ca7..5d7f4fcc2c3 100644 --- a/src/hmm/hmm-utils-test.cc +++ b/src/hmm/hmm-utils-test.cc @@ -202,7 +202,7 @@ void TestAccumulateTreeStatsOptions() { void TestSplitToPhones() { ContextDependency *ctx_dep = NULL; - TransitionModel *trans_model = GenRandTransitionModel(&ctx_dep); + Transitions *trans_model = GenRandTransitions(&ctx_dep); std::vector phone_seq; int32 num_phones = RandInt(0, 10); const std::vector &phone_list = trans_model->GetPhones(); @@ -210,18 +210,18 @@ void TestSplitToPhones() { int32 rand_phone = phone_list[RandInt(0, phone_list.size() - 1)]; phone_seq.push_back(rand_phone); } - bool reorder = (RandInt(0, 1) == 0); std::vector alignment; - GenerateRandomAlignment(*ctx_dep, *trans_model, reorder, + GenerateRandomAlignment(*ctx_dep, *trans_model, phone_seq, &alignment); std::vector > split_alignment; - SplitToPhones(*trans_model, alignment, &split_alignment); + bool ans = SplitToPhones(*trans_model, alignment, &split_alignment); + KALDI_ASSERT(ans); KALDI_ASSERT(split_alignment.size() == phone_seq.size()); for (size_t i = 0; i < split_alignment.size(); i++) { KALDI_ASSERT(!split_alignment[i].empty()); for (size_t j = 0; j < split_alignment[i].size(); j++) { int32 transition_id = split_alignment[i][j]; - KALDI_ASSERT(trans_model->TransitionIdToPhone(transition_id) == + KALDI_ASSERT(trans_model->InfoForTransitionId(transition_id).phone == phone_seq[i]); } } @@ -230,18 +230,14 @@ void TestSplitToPhones() { } void TestConvertAlignment() { - bool old_reorder = (RandInt(0, 1) == 1), - new_reorder = (RandInt(0, 1) == 1), - new_tree = (RandInt(0, 1) == 1), + bool new_tree = (RandInt(0, 1) == 1), new_topology = (RandInt(0, 1) == 1); if (!new_tree) new_topology = true; int32 subsample_factor = RandInt(1, 3); - KALDI_LOG << " old-reorder = " << old_reorder - << ", new-reorder = " << new_reorder - << ", new-tree = " << new_tree + KALDI_LOG << ", new-tree = " << new_tree << ", subsample-factor = " << subsample_factor; std::vector phones; @@ -273,11 +269,11 @@ void TestConvertAlignment() { } - HmmTopology topo_old = GenRandTopology(phones, num_pdf_classes_old), + Topology topo_old = GenRandTopology(phones, num_pdf_classes_old), topo_new = (new_topology ? GenRandTopology(phones, num_pdf_classes_new) : topo_old); - TransitionModel trans_model_old(*ctx_dep_old, topo_old), + Transitions trans_model_old(*ctx_dep_old, topo_old), trans_model_new(*ctx_dep_new, topo_new); std::vector phone_sequence; @@ -286,15 +282,15 @@ void TestConvertAlignment() { phone_sequence.push_back(phones[RandInt(0, phones.size() - 1)]); std::vector old_alignment; GenerateRandomAlignment(*ctx_dep_old, trans_model_old, - old_reorder, phone_sequence, + phone_sequence, &old_alignment); std::vector new_alignment; bool ans = ConvertAlignment(trans_model_old, trans_model_new, *ctx_dep_new, old_alignment, subsample_factor, false, - new_reorder, NULL, &new_alignment); - if(!ans) { + NULL, &new_alignment); + if (!ans) { KALDI_WARN << "Alignment conversion failed"; // make sure it failed for a good reason. 
KALDI_ASSERT(new_topology || subsample_factor > 1); @@ -305,14 +301,14 @@ void TestConvertAlignment() { KALDI_ASSERT(b1 && b2); KALDI_ASSERT(old_split.size() == new_split.size()); for (size_t i = 0; i < new_split.size(); i++) - KALDI_ASSERT(trans_model_old.TransitionIdToPhone(old_split[i].front()) == - trans_model_new.TransitionIdToPhone(new_split[i].front())); + KALDI_ASSERT(trans_model_old.InfoForTransitionId(old_split[i].front()).phone == + trans_model_new.InfoForTransitionId(new_split[i].front()).phone); if (!new_topology && subsample_factor == 1) { // we should be able to convert back and it'll be the same. std::vector old_alignment_copy; bool ans = ConvertAlignment(trans_model_new, trans_model_old, *ctx_dep_old, new_alignment, subsample_factor, false, - old_reorder, NULL, &old_alignment_copy); + NULL, &old_alignment_copy); KALDI_ASSERT(ans); KALDI_ASSERT(old_alignment_copy == old_alignment); } @@ -336,4 +332,3 @@ int main() { kaldi::TestConvertAlignment(); std::cout << "Test OK.\n"; } - diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc index 06edf8d5976..7bd6070f151 100644 --- a/src/hmm/hmm-utils.cc +++ b/src/hmm/hmm-utils.cc @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation // 2018 Johns Hopkins University (author: Daniel Povey) +// 2019 Daniel Galvez // See ../../COPYING for clarification regarding multiple authors // @@ -18,6 +19,7 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. +#include #include #include "hmm/hmm-utils.h" @@ -27,16 +29,12 @@ namespace kaldi { - - -fst::VectorFst *GetHmmAsFsa( - std::vector phone_window, +std::shared_ptr GetHmmAsFsa( + const std::vector &phone_window, const ContextDependencyInterface &ctx_dep, - const TransitionModel &trans_model, - const HTransducerConfig &config, + const Transitions &trans_model, + bool include_self_loops, HmmCacheType *cache) { - using namespace fst; - if (static_cast(phone_window.size()) != ctx_dep.ContextWidth()) KALDI_ERR << "Context size mismatch, ilabel-info [from context FST is " << phone_window.size() << ", context-dependency object " @@ -48,16 +46,14 @@ fst::VectorFst *GetHmmAsFsa( KALDI_ERR << "phone == 0. Some mismatch happened, or there is " "a code error."; - const HmmTopology &topo = trans_model.GetTopo(); - const HmmTopology::TopologyEntry &entry = topo.TopologyForPhone(phone); + const Topology &topo = trans_model.GetTopo(); - // vector of the pdfs, indexed by pdf-class (pdf-classes must start from zero - // and be contiguous). - std::vector pdfs(topo.NumPdfClasses(phone)); - for (int32 pdf_class = 0; - pdf_class < static_cast(pdfs.size()); + // vector of the pdf-ids, indexed by pdf-class minus one. + std::vector pdf_ids(topo.NumPdfClasses(phone)); + for (int32 pdf_class = 1; + pdf_class <= static_cast(pdf_ids.size()); pdf_class++) { - if (! ctx_dep.Compute(phone_window, pdf_class, &(pdfs[pdf_class])) ) { + if (! 
ctx_dep.Compute(phone_window, pdf_class, &(pdf_ids[pdf_class - 1])) ) { std::ostringstream ctx_ss; for (size_t i = 0; i < phone_window.size(); i++) ctx_ss << phone_window[i] << ' '; @@ -70,80 +66,65 @@ fst::VectorFst *GetHmmAsFsa( " that general nature."; } } - std::pair > cache_index(phone, pdfs); + + std::pair > cache_index(phone, pdf_ids); if (cache != NULL) { HmmCacheType::iterator iter = cache->find(cache_index); if (iter != cache->end()) return iter->second; } - VectorFst *ans = new VectorFst; + using Arc = fst::StdArc; + using StateId = Arc::StateId; + using Weight = Arc::Weight; - typedef StdArc Arc; - typedef Arc::Weight Weight; - typedef Arc::StateId StateId; - typedef Arc::Label Label; + const fst::StdVectorFst &entry = topo.TopologyForPhone(phone); + // the elements correction_factors are factors only in the semiring; + // physically they are costs to be added. + std::vector correction_factors; + if (include_self_loops) + correction_factors.resize(entry.NumStates(), 0); + else + correction_factors = topo.CorrectionFactorsForPhone(phone); + const std::vector &self_loop_pdf_classes = + topo.SelfLoopPdfClassesForPhone(phone); + std::shared_ptr ans( + new fst::StdVectorFst()); + StateId num_states = entry.NumStates(); + for (StateId s = 0; s < num_states; s++) + ans->AddState(); + KALDI_PARANOID_ASSERT(entry.Start() == 0); // required by topology class. + ans->SetStart(0); - std::vector state_ids; - for (size_t i = 0; i < entry.size(); i++) - state_ids.push_back(ans->AddState()); - KALDI_ASSERT(state_ids.size() != 0); // Or empty topology entry. - ans->SetStart(state_ids[0]); - StateId final = state_ids.back(); - ans->SetFinal(final, Weight::One()); - - for (int32 hmm_state = 0; - hmm_state < static_cast(entry.size()); - hmm_state++) { - int32 forward_pdf_class = entry[hmm_state].forward_pdf_class, forward_pdf; - int32 self_loop_pdf_class = entry[hmm_state].self_loop_pdf_class, self_loop_pdf; - if (forward_pdf_class == kNoPdf) { // nonemitting state. - forward_pdf = kNoPdf; - self_loop_pdf = kNoPdf; - } else { - KALDI_ASSERT(forward_pdf_class < static_cast(pdfs.size())); - KALDI_ASSERT(self_loop_pdf_class < static_cast(pdfs.size())); - forward_pdf = pdfs[forward_pdf_class]; - self_loop_pdf = pdfs[self_loop_pdf_class]; - } - int32 trans_idx; - for (trans_idx = 0; - trans_idx < static_cast(entry[hmm_state].transitions.size()); - trans_idx++) { - BaseFloat log_prob; - Label label; - int32 dest_state = entry[hmm_state].transitions[trans_idx].first; - bool is_self_loop = (dest_state == hmm_state); - if (is_self_loop) - continue; // We will add self-loops in at a later stage of processing, - // not in this function. - if (forward_pdf_class == kNoPdf) { - // no pdf, hence non-estimated probability. - // [would not happen with normal topology] . There is no transition-state - // involved in this case. - log_prob = Log(entry[hmm_state].transitions[trans_idx].second); - label = 0; - } else { // normal probability. - int32 trans_state = - trans_model.TupleToTransitionState(phone, hmm_state, forward_pdf, self_loop_pdf); - int32 trans_id = - trans_model.PairToTransitionId(trans_state, trans_idx); - log_prob = trans_model.GetTransitionLogProbIgnoringSelfLoops(trans_id); - // log_prob is a negative number (or zero)... - label = trans_id; - } - // Will add probability-scale later (we may want to push first). 
- ans->AddArc(state_ids[hmm_state], - Arc(label, label, Weight(-log_prob), state_ids[dest_state])); + for (StateId s = 0; s < num_states; s++) { + Weight correction_weight(correction_factors[s]); + ans->SetFinal(s, Times(correction_weight, entry.Final(s))); + + for (fst::ArcIterator aiter(entry, s); + !aiter.Done(); aiter.Next()) { + if (!include_self_loops && aiter.Value().nextstate == s) + continue; + Arc arc = aiter.Value(); + + // self_loop_pdf_class is the pdf-class of the self-loop of the destination + // state of this arc, if any, else -1. + int32 self_loop_pdf_class = self_loop_pdf_classes[arc.nextstate]; + // self_loop_pdf_id is the pdf-id of the self-loop in the destination + // state of this arc, if any, else -1. + int32 self_loop_pdf_id = (self_loop_pdf_class != -1 ? + pdf_ids[self_loop_pdf_class - 1] : -1); + int32 pdf_class = arc.ilabel, + pdf_id = pdf_ids[pdf_class - 1], + trans_id = trans_model.TupleToTransitionId( + phone, s, aiter.Position(), pdf_id, self_loop_pdf_id); + + arc.ilabel = trans_id; + arc.olabel = trans_id; + arc.weight = Times(correction_weight, arc.weight); + ans->AddArc(s, arc); } } - fst::RemoveEpsLocal(ans); // this is safe and will not blow up. - - // Now apply probability scale. - // We waited till after the possible weight-pushing steps, - // because weight-pushing needs "real" weights in order to work. - ApplyProbabilityScale(config.transition_scale, ans); if (cache != NULL) (*cache)[cache_index] = ans; return ans; @@ -151,95 +132,14 @@ fst::VectorFst *GetHmmAsFsa( -fst::VectorFst* -GetHmmAsFsaSimple(std::vector phone_window, - const ContextDependencyInterface &ctx_dep, - const TransitionModel &trans_model, - BaseFloat prob_scale) { - using namespace fst; - - if (static_cast(phone_window.size()) != ctx_dep.ContextWidth()) - KALDI_ERR <<"Context size mismatch, ilabel-info [from context FST is " - <<(phone_window.size())<<", context-dependency object " - "expects "<<(ctx_dep.ContextWidth()); - - int P = ctx_dep.CentralPosition(); - int32 phone = phone_window[P]; - KALDI_ASSERT(phone != 0); - - const HmmTopology &topo = trans_model.GetTopo(); - const HmmTopology::TopologyEntry &entry = topo.TopologyForPhone(phone); - - VectorFst *ans = new VectorFst; - - // Create a mini-FST with a superfinal state [in case we have emitting - // final-states, which we usually will.] - typedef StdArc Arc; - typedef Arc::Weight Weight; - typedef Arc::StateId StateId; - typedef Arc::Label Label; - - std::vector state_ids; - for (size_t i = 0; i < entry.size(); i++) - state_ids.push_back(ans->AddState()); - KALDI_ASSERT(state_ids.size() > 1); // Or invalid topology entry. - ans->SetStart(state_ids[0]); - StateId final = state_ids.back(); - ans->SetFinal(final, Weight::One()); - - for (int32 hmm_state = 0; - hmm_state < static_cast(entry.size()); - hmm_state++) { - int32 forward_pdf_class = entry[hmm_state].forward_pdf_class, forward_pdf; - int32 self_loop_pdf_class = entry[hmm_state].self_loop_pdf_class, self_loop_pdf; - if (forward_pdf_class == kNoPdf) { // nonemitting state; not generally used. 
- forward_pdf = kNoPdf; - self_loop_pdf = kNoPdf; - } else { - bool ans = ctx_dep.Compute(phone_window, forward_pdf_class, &forward_pdf); - KALDI_ASSERT(ans && "Context-dependency computation failed."); - ans = ctx_dep.Compute(phone_window, self_loop_pdf_class, &self_loop_pdf); - KALDI_ASSERT(ans && "Context-dependency computation failed."); - } - int32 trans_idx; - for (trans_idx = 0; - trans_idx < static_cast(entry[hmm_state].transitions.size()); - trans_idx++) { - BaseFloat log_prob; - Label label; - int32 dest_state = entry[hmm_state].transitions[trans_idx].first; - if (forward_pdf_class == kNoPdf) { - // no pdf, hence non-estimated probability. very unusual case. [would - // not happen with normal topology] . There is no transition-state - // involved in this case. - KALDI_ASSERT(hmm_state != dest_state); - log_prob = Log(entry[hmm_state].transitions[trans_idx].second); - label = 0; - } else { // normal probability. - int32 trans_state = - trans_model.TupleToTransitionState(phone, hmm_state, forward_pdf, self_loop_pdf); - int32 trans_id = - trans_model.PairToTransitionId(trans_state, trans_idx); - log_prob = prob_scale * trans_model.GetTransitionLogProb(trans_id); - // log_prob is a negative number (or zero)... - label = trans_id; - } - ans->AddArc(state_ids[hmm_state], - Arc(label, label, Weight(-log_prob), state_ids[dest_state])); - } - } - return ans; -} - - - /// This utility function, used in GetHTransducer(), creates an FSA (finite /// state acceptor, i.e. an FST with ilabels equal to olabels) with a single /// successful path, with a single label on it. -static inline fst::VectorFst *MakeTrivialAcceptor(int32 label) { +static inline std::unique_ptr> +MakeTrivialAcceptor(int32 label) { typedef fst::StdArc Arc; typedef Arc::Weight Weight; - fst::VectorFst *ans = new fst::VectorFst; + std::unique_ptr> ans(new fst::VectorFst); ans->AddState(); ans->AddState(); ans->SetStart(0); @@ -251,11 +151,12 @@ static inline fst::VectorFst *MakeTrivialAcceptor(int32 label) { // The H transducer has a separate outgoing arc for each of the symbols in ilabel_info. -fst::VectorFst *GetHTransducer(const std::vector > &ilabel_info, - const ContextDependencyInterface &ctx_dep, - const TransitionModel &trans_model, - const HTransducerConfig &config, - std::vector *disambig_syms_left) { +std::unique_ptr> +GetHTransducer(const std::vector > &ilabel_info, + const ContextDependencyInterface &ctx_dep, + const Transitions &trans_model, + const HTransducerConfig &config, + std::vector *disambig_syms_left) { KALDI_ASSERT(ilabel_info.size() >= 1 && ilabel_info[0].size() == 0); // make sure that eps == eps. HmmCacheType cache; // "cache" is an optimization that prevents GetHmmAsFsa repeating work @@ -266,7 +167,14 @@ fst::VectorFst *GetHTransducer(const std::vector typedef Arc::StateId StateId; typedef Arc::Label Label; - std::vector* > fsts(ilabel_info.size(), NULL); + // I would prefer to do this: + // std::vector>> fsts(ilabel_info.size(), std::unique_ptr(nullptr)); + // But the second arg of constructor (2) at https://en.cppreference.com/w/cpp/container/vector/vector + // must be able to be turned into a const-reference, which std::unique_ptr cannot be. + std::vector>> fsts; + for(std::size_t i = 0; i < ilabel_info.size(); ++i) { + fsts.emplace_back(std::unique_ptr>(nullptr)); + } std::vector phones = trans_model.GetPhones(); KALDI_ASSERT(disambig_syms_left != 0); @@ -315,26 +223,29 @@ fst::VectorFst *GetHTransducer(const std::vector } else { // Real phone-in-context. 
std::vector phone_window = ilabel_info[j]; - VectorFst *fst = GetHmmAsFsa(phone_window, - ctx_dep, - trans_model, - config, - &cache); - fsts[j] = fst; + std::shared_ptr> fst = GetHmmAsFsa(phone_window, + ctx_dep, + trans_model, + config.include_self_loops, + &cache); + std::unique_ptr> u_fst(fst->Copy()); + fsts[j] = std::move(u_fst); } } - VectorFst *ans = MakeLoopFst(fsts); - SortAndUniq(&fsts); // remove duplicate pointers, which we will have - // in general, since we used the cache. - DeletePointers(&fsts); + // fsts_bare is as fsts, but with bare pointers. + std::vector *> fsts_bare(fsts.size()); + for (size_t i = 0; i < fsts.size(); i++) + fsts_bare[i] = fsts[i].get(); + + std::unique_ptr> ans(MakeLoopFst(fsts_bare)); return ans; } void GetIlabelMapping (const std::vector > &ilabel_info_old, const ContextDependencyInterface &ctx_dep, - const TransitionModel &trans_model, + const Transitions &trans_model, std::vector *old2new_map) { KALDI_ASSERT(old2new_map != NULL); @@ -366,8 +277,8 @@ void GetIlabelMapping (const std::vector > &ilabel_info_old, int32 central_phone = vec[P]; int32 num_pdf_classes = trans_model.GetTopo().NumPdfClasses(central_phone); std::vector state_seq(num_pdf_classes); // Indexed by pdf-class - for (int32 pdf_class = 0; pdf_class < num_pdf_classes; pdf_class++) { - if (!ctx_dep.Compute(vec, pdf_class, &(state_seq[pdf_class]))) { + for (int32 pdf_class = 1; pdf_class <= num_pdf_classes; pdf_class++) { + if (!ctx_dep.Compute(vec, pdf_class, &(state_seq[pdf_class - 1]))) { std::ostringstream ss; WriteIntegerVector(ss, false, vec); KALDI_ERR << "tree did not succeed in converting phone window "< > &ilabel_info_old, -fst::VectorFst *GetPdfToTransitionIdTransducer(const TransitionModel &trans_model) { +std::unique_ptr> +GetPdfToTransitionIdTransducer(const Transitions &trans_model) { using namespace fst; - VectorFst *ans = new VectorFst; + std::unique_ptr> ans(new VectorFst); typedef VectorFst::Weight Weight; typedef StdArc Arc; ans->AddState(); ans->SetStart(0); ans->SetFinal(0, Weight::One()); for (int32 tid = 1; tid <= trans_model.NumTransitionIds(); tid++) { - int32 pdf = trans_model.TransitionIdToPdf(tid); - ans->AddArc(0, Arc(pdf+1, tid, Weight::One(), 0)); // note the offset of 1 on the pdfs. + int32 pdf = trans_model.TransitionIdToPdfFast(tid); + ans->AddArc(0, Arc(pdf+1, tid, Weight::One(), 0)); // note the offset of 1 on the pdf_ids. // it's because 0 is a valid pdf. 
} return ans; } +struct TransitionState { +public: + TransitionState(const Transitions::TransitionIdInfo& info): + info(info) { } + bool operator==(const TransitionState& other) const { + return info.phone == other.info.phone && + info.topo_state == other.info.topo_state && + info.pdf_id == other.info.pdf_id; + } + + bool operator!=(const TransitionState& other) const { + return !(*this == other); + } -class TidToTstateMapper { + TransitionState& operator=(TransitionState other) { +// TODO: Fix this bizarre error when I uncomment this: + // this->info = other.info; + KALDI_ASSERT(false); +// hmm-utils.cc: In member function ‘kaldi::TransitionState& kaldi::TransitionState::operator=(kaldi::TransitionState)’: +// hmm-utils.cc:351:24: error: passing ‘const kaldi::Transitions::TransitionIdInfo’ as ‘this’ argument discards qualifiers [-fpermissive] +// this->info = other.info; +// ^~~~ +// In file included from ../hmm/hmm-utils.h:27:0, +// from hmm-utils.cc:25: +// ../hmm/transitions.h:107:10: note: in call to ‘kaldi::Transitions::TransitionIdInfo& kaldi::Transitions::TransitionIdInfo::operator=(const kaldi::Transitions::TransitionIdInfo&)’ + + return *this; + } + + bool operator<(const TransitionState& other) const { + return info < other.info; + } + + const Transitions::TransitionIdInfo& info; +}; + +class TidToSelfLoopMapper { public: - // Function object used in MakePrecedingInputSymbolsSameClass and - // MakeFollowingInputSymbolsSameClass (as called by AddSelfLoopsReorder and - // AddSelfLoopsNoReorder). It maps transition-ids to transition-states (and - // -1 to -1, 0 to 0 and disambiguation symbols to 0). If check_no_self_loops - // == true, it also checks that there are no self-loops in the graph (i.e. in - // the labels it is called with). This is just a convenient place to put this - // check. - - // This maps valid transition-ids to transition states, maps kNoLabel to -1, and - // maps all other symbols (i.e. epsilon symbols, disambig symbols, and symbols - // with values over 100000/kNontermBigNumber) to zero. - // Its point is to provide an equivalence class on labels that's relevant to what - // the self-loop will be on the following (or preceding) state. - TidToTstateMapper(const TransitionModel &trans_model, - const std::vector &disambig_syms, - bool check_no_self_loops): + // Function object used in MakePrecedingInputSymbolsSameClass and. + // It maps a transition-ids t to the transition-id on the self-loop + // of the destination-state of t (or 0 if there is no self-loop). + // + // If currently_self_loop_free == true, it also checks that there are no + // self-loops in the graph (i.e. in the labels it is called with). This is + // just a convenient place to put this check. + + // This maps valid transition-ids to transition states, and maps all other + // symbols (i.e. epsilon symbols, disambig symbols, and symbols with values + // over 100000/kNontermBigNumber) to zero. (and -1 == kNoLabel to -1). + // Its purpose is to provide an + // equivalence class on labels that's relevant to what the self-loop will be + // on the following state. 
+ TidToSelfLoopMapper(const Transitions &trans_model, + const std::vector &disambig_syms, + bool currently_self_loop_free): trans_model_(trans_model), disambig_syms_(disambig_syms), - check_no_self_loops_(check_no_self_loops) { } - typedef int32 Result; - int32 operator() (int32 label) const { - if (label == static_cast(fst::kNoLabel)) return -1; // -1 -> -1 - else if (label >= 1 && label <= trans_model_.NumTransitionIds()) { - if (check_no_self_loops_ && trans_model_.IsSelfLoop(label)) + currently_self_loop_free_(currently_self_loop_free) { } + + int32 operator() (int32 tid) const { + if (tid > 0 && tid <= trans_model_.NumTransitionIds()) { + if (currently_self_loop_free_ && trans_model_.InfoForTransitionId(tid).is_self_loop) KALDI_ERR << "AddSelfLoops: graph already has self-loops."; - return trans_model_.TransitionIdToTransitionState(label); + return trans_model_.InfoForTransitionId(tid).self_loop_transition_id; + } else if (tid == fst::kNoLabel) { + return -1; } else { // 0 or (presumably) disambiguation symbol. Map to zero int32 big_number = fst::kNontermBigNumber; // 1000000 - if (label != 0 && label < big_number) + if (tid != 0 && tid < big_number) { KALDI_ASSERT(std::binary_search(disambig_syms_.begin(), disambig_syms_.end(), - label)); // or invalid label + tid) && + "It looks like you have an invalid symbol in your graph: "); + } return 0; } } private: - const TransitionModel &trans_model_; + const Transitions &trans_model_; const std::vector &disambig_syms_; // sorted. - bool check_no_self_loops_; + bool currently_self_loop_free_; }; -// This is the code that expands an FST from transition-states to -// transition-ids, in the case where reorder == true, i.e. the non-optional -// transition is before the self-loop. -static void AddSelfLoopsReorder(const TransitionModel &trans_model, - const std::vector &disambig_syms, - BaseFloat self_loop_scale, - bool check_no_self_loops, - fst::VectorFst *fst) { +// Returns true if the outgoing arcs of the state s sum to 1.0 +template +static bool StateIsStochastic(FST fst, typename FST::StateId s) { + using namespace fst; + using Arc = typename FST::Arc; + using Weight = typename Arc::Weight; + Weight total_prob = Weight::Zero(); + for (MutableArcIterator > aiter(&fst, s); + !aiter.Done(); + aiter.Next()) { + total_prob = Plus(total_prob, aiter.Value().weight); + } + return fst::ApproxEqual(total_prob, Weight::One()); +} + +void AddSelfLoops(const Transitions &trans_model, + const std::vector &disambig_syms, + bool currently_self_loop_free, + bool use_weights, + fst::VectorFst *fst) { + KALDI_ASSERT(fst->Start() != fst::kNoStateId); using namespace fst; typedef StdArc Arc; typedef Arc::Label Label; typedef Arc::StateId StateId; typedef Arc::Weight Weight; - TidToTstateMapper f(trans_model, disambig_syms, check_no_self_loops); + TidToSelfLoopMapper f(trans_model, disambig_syms, currently_self_loop_free); + // Duplicate states as necessary so that each state will require at most one // self-loop to be added to it. Approximately this means that if a // state has multiple different symbols on arcs entering it, it will be // duplicated, with one copy per incoming symbol. MakePrecedingInputSymbolsSameClass(true, fst, f); - int32 kNoTransState = f(kNoLabel); - KALDI_ASSERT(kNoTransState == -1); - - // use the following to keep track of the transition-state for each state. 
- std::vector state_in(fst->NumStates(), kNoTransState); - // This first loop just works out the label into each state, // and converts the transitions in the graph from transition-states // to transition-ids. + // state_in maps each state in the fst to its TransitionState - for (StateIterator > siter(*fst); - !siter.Done(); - siter.Next()) { - StateId s = siter.Value(); + + StateId num_states = fst->NumStates(); + // self_loop_transition_id gives the transition-id of the self-loop of this + // state, or zero if it doesn't require a self-loop. + // -1 is where we don't know the self-loop transition id (if any) + // for this state yet. + std::vector self_loop_transition_id(num_states, -1); + + for (StateId s = 0; s < num_states; s++) { for (MutableArcIterator > aiter(fst, s); !aiter.Done(); aiter.Next()) { - Arc arc = aiter.Value(); - int32 trans_state = f(arc.ilabel); - if (state_in[arc.nextstate] == kNoTransState) - state_in[arc.nextstate] = trans_state; - else { - KALDI_ASSERT(state_in[arc.nextstate] == trans_state); + const Arc &arc = aiter.Value(); + int32 next_state_self_loop_transition_id = f(arc.ilabel); + if (self_loop_transition_id[arc.nextstate] == -1) { + // Note: next_state_self_loop_transition_id could be + self_loop_transition_id[arc.nextstate] = + next_state_self_loop_transition_id; + } else { + KALDI_ASSERT(self_loop_transition_id[arc.nextstate] == + next_state_self_loop_transition_id); // or probably an error in MakePrecedingInputSymbolsSame. } } } - KALDI_ASSERT(state_in[fst->Start()] == kNoStateId || state_in[fst->Start()] == 0); - // or MakePrecedingInputSymbolsSame failed. - - // The next loop looks at each graph state, adds the self-loop [if needed] and - // multiples all the out-transitions' probs (and final-prob) by the - // forward-prob for that state (which is one minus self-loop-prob). We do it - // like this to maintain stochasticity (i.e. rather than multiplying the arcs - // with the corresponding labels on them by this probability). - - for (StateId s = 0; s < static_cast(state_in.size()); s++) { - if (state_in[s] > 0) { // defined, and not eps or a disambiguation symbol or a - // nonterminal-related sybol for grammar decoding... - int32 trans_state = static_cast(state_in[s]); - // First multiply all probabilities by "forward" probability. - BaseFloat log_prob = trans_model.GetNonSelfLoopLogProb(trans_state); - fst->SetFinal(s, Times(fst->Final(s), Weight(-log_prob*self_loop_scale))); - for (MutableArcIterator > aiter(fst, s); - !aiter.Done(); - aiter.Next()) { - Arc arc = aiter.Value(); - arc.weight = Times(arc.weight, Weight(-log_prob*self_loop_scale)); - aiter.SetValue(arc); - } - // Now add self-loop, if needed. - int32 trans_id = trans_model.SelfLoopOf(trans_state); - if (trans_id != 0) { // has self-loop. - BaseFloat log_prob = trans_model.GetTransitionLogProb(trans_id); - fst->AddArc(s, Arc(trans_id, 0, Weight(-log_prob*self_loop_scale), s)); + if (!currently_self_loop_free) { + // there might be some self-loops present already, so make sure we don't + // duplicate them. + for (StateId s = 0; s < num_states; s++) { + for (MutableArcIterator > aiter(fst, s); + !aiter.Done(); + aiter.Next()) { + const Arc &arc = aiter.Value(); + int32 tid = arc.ilabel; + if (tid > 0 && tid <= trans_model.NumTransitionIds() && + trans_model.InfoForTransitionId(tid).is_self_loop) + self_loop_transition_id[s] = 0; } } + } else { + // We shouldn't have added a self-loop to the start state. 
+ KALDI_ASSERT(self_loop_transition_id[fst->Start()] <= 0); } -} - - -// this is the code that expands an FST from transition-states to -// transition-ids, in the case where reorder == false, i.e. non-optional -// transition is after the self-loop. -static void AddSelfLoopsNoReorder( - const TransitionModel &trans_model, - const std::vector &disambig_syms, - BaseFloat self_loop_scale, - bool check_no_self_loops, - fst::VectorFst *fst) { - using namespace fst; - typedef StdArc Arc; - typedef Arc::Label Label; - typedef Arc::StateId StateId; - typedef Arc::Weight Weight; - - // Duplicate states as necessary so that each state has at most one self-loop - // on it. - TidToTstateMapper f(trans_model, disambig_syms, check_no_self_loops); - MakeFollowingInputSymbolsSameClass(true, fst, f); - StateId num_states = fst->NumStates(); - for (StateId s = 0; s < num_states; s++) { - int32 my_trans_state = f(kNoLabel); - KALDI_ASSERT(my_trans_state == -1); - for (MutableArcIterator > aiter(fst, s); - !aiter.Done(); - aiter.Next()) { - Arc arc = aiter.Value(); - if (my_trans_state == -1) my_trans_state = f(arc.ilabel); - else KALDI_ASSERT(my_trans_state == f(arc.ilabel)); // or MakeFollowingInputSymbolsSameClass failed. - if (my_trans_state > 0) { // transition-id; multiply weight... - BaseFloat log_prob = trans_model.GetNonSelfLoopLogProb(my_trans_state); - arc.weight = Times(arc.weight, Weight(-log_prob*self_loop_scale)); + // The next loop looks at each graph state, adds the self-loop [if needed] and + // multiples all the out-transitions' probs (and final-prob) by the inverse of + // the correction factor that we used when creating the no-self-loops graph. + // We do it like this to maintain stochasticity throughout the graph compilation + // process. + + if (use_weights) { + for (StateId s = 0; s < num_states; s++) { + int32 tid = self_loop_transition_id[s]; + if (tid <= 0) + continue; + const auto &info(trans_model.InfoForTransitionId(tid)); + + BaseFloat self_loop_cost = info.transition_cost, + correction_factor = trans_model.GetTopo().CorrectionFactorsForPhone( + info.phone)[info.topo_state]; + Weight correction(-correction_factor), + self_loop_weight(self_loop_cost); + + fst->SetFinal(s, Times(fst->Final(s), correction)); + for (MutableArcIterator > aiter(fst, s); + !aiter.Done(); + aiter.Next()) { + Arc arc = aiter.Value(); + arc.weight = Times(arc.weight, correction); aiter.SetValue(arc); } + // Add self-loop. ilabel is `tid`, olabel is epsilon (0). + fst->AddArc(s, Arc(tid, 0, self_loop_weight, s)); } - if (fst->Final(s) != Weight::Zero()) { - KALDI_ASSERT(my_trans_state == kNoLabel || my_trans_state == 0); // or MakeFollowingInputSymbolsSameClass failed. - } - if (my_trans_state != kNoLabel && my_trans_state != 0) { - // a transition-state; add self-loop, if it has one. - int32 trans_id = trans_model.SelfLoopOf(my_trans_state); - if (trans_id != 0) { // has self-loop. 
- BaseFloat log_prob = trans_model.GetTransitionLogProb(trans_id); - fst->AddArc(s, Arc(trans_id, 0, Weight(-log_prob*self_loop_scale), s)); - } - } - } -} - -void AddSelfLoops(const TransitionModel &trans_model, - const std::vector &disambig_syms, - BaseFloat self_loop_scale, - bool reorder, - bool check_no_self_loops, - fst::VectorFst *fst) { - KALDI_ASSERT(fst->Start() != fst::kNoStateId); - if (reorder) - AddSelfLoopsReorder(trans_model, disambig_syms, self_loop_scale, - check_no_self_loops, fst); - else - AddSelfLoopsNoReorder(trans_model, disambig_syms, self_loop_scale, - check_no_self_loops, fst); -} - -// IsReordered returns true if the transitions were possibly reordered. This reordering -// can happen in AddSelfLoops, if the "reorder" option was true. -// This makes the out-transition occur before the self-loop transition. -// The function returns false (no reordering) if there is not enough information in -// the alignment to tell (i.e. no self-loop were taken), and in this case the calling -// code doesn't care what the answer is. -// The "alignment" vector contains a sequence of TransitionIds. - -static bool IsReordered(const TransitionModel &trans_model, - const std::vector &alignment) { - for (size_t i = 0; i + 1 < alignment.size(); i++) { - int32 tstate1 = trans_model.TransitionIdToTransitionState(alignment[i]), - tstate2 = trans_model.TransitionIdToTransitionState(alignment[i+1]); - if (tstate1 != tstate2) { - bool is_loop_1 = trans_model.IsSelfLoop(alignment[i]), - is_loop_2 = trans_model.IsSelfLoop(alignment[i+1]); - KALDI_ASSERT(!(is_loop_1 && is_loop_2)); // Invalid. - if (is_loop_1) return true; // Reordered. self-loop is last. - if (is_loop_2) return false; // Not reordered. self-loop is first. + } else { + for (StateId s = 0; s < num_states; s++) { + int32 tid = self_loop_transition_id[s]; + // Add self-loop. ilabel is `tid`, olabel is epsilon (0). + fst->AddArc(s, Arc(tid, 0, Weight::One(), s)); } } - - // Just one trans-state in whole sequence. - if (alignment.empty()) return false; - else { - bool is_loop_front = trans_model.IsSelfLoop(alignment.front()), - is_loop_back = trans_model.IsSelfLoop(alignment.back()); - if (is_loop_front) return false; // Not reordered. Self-loop is first. - if (is_loop_back) return true; // Reordered. Self-loop is last. - return false; // We really don't know in this case but calling code should - // not care. - } } // SplitToPhonesInternal takes as input the "alignment" vector containing @@ -656,9 +549,8 @@ static bool IsReordered(const TransitionModel &trans_model, // checks (if the input does not start at the start of a phone or does not // end at the end of a phone, we should expect that false will be returned). -static bool SplitToPhonesInternal(const TransitionModel &trans_model, +static bool SplitToPhonesInternal(const Transitions &trans_model, const std::vector &alignment, - bool reordered, std::vector > *split_output) { if (alignment.empty()) return true; // nothing to split. std::vector end_points; // points at which phones end [in an @@ -666,69 +558,44 @@ static bool SplitToPhonesInternal(const TransitionModel &trans_model, // each phone].. bool was_ok = true; - for (size_t i = 0; i < alignment.size(); i++) { + int32 prev_phone = trans_model.InfoForTransitionId(alignment[0]).phone; + // i = 0 can't be an end point, it's the start of the sequence, + // so we start with 1. 
+ for (size_t i = 1; i < alignment.size(); i++) { int32 trans_id = alignment[i]; - if (trans_model.IsFinal(trans_id)) { // is final-prob - if (!reordered) end_points.push_back(i+1); - else { // reordered. - while (i+1 < alignment.size() && - trans_model.IsSelfLoop(alignment[i+1])) { - KALDI_ASSERT(trans_model.TransitionIdToTransitionState(alignment[i]) == - trans_model.TransitionIdToTransitionState(alignment[i+1])); - i++; - } - end_points.push_back(i+1); - } - } else if (i+1 == alignment.size()) { - // need to have an end-point at the actual end. - // but this is an error- should have been detected already. + const auto &info = trans_model.InfoForTransitionId(trans_id); + if (info.is_initial) { + end_points.push_back(i); + } else if (info.phone != prev_phone) { + KALDI_WARN << "Not OK."; was_ok = false; - end_points.push_back(i+1); - } else { - int32 this_state = trans_model.TransitionIdToTransitionState(alignment[i]), - next_state = trans_model.TransitionIdToTransitionState(alignment[i+1]); - if (this_state == next_state) continue; // optimization. - int32 this_phone = trans_model.TransitionStateToPhone(this_state), - next_phone = trans_model.TransitionStateToPhone(next_state); - if (this_phone != next_phone) { - // The phone changed, but this is an error-- we should have detected this via the - // IsFinal check. - was_ok = false; - end_points.push_back(i+1); - } } + prev_phone = info.phone; + } + end_points.push_back(alignment.size()); + if (!trans_model.InfoForTransitionId(alignment.back()).is_final) { + KALDI_WARN << "Not OK."; + was_ok = false; } - size_t cur_point = 0; - for (size_t i = 0; i < end_points.size(); i++) { + size_t cur_start = 0; + for (int32 end_point: end_points) { split_output->push_back(std::vector()); - // The next if-statement checks if the initial trans-id at the current end - // point is the initial-state of the current phone if that initial-state - // is emitting (a cursory check that the alignment is plausible). - int32 trans_state = - trans_model.TransitionIdToTransitionState(alignment[cur_point]); - int32 phone = trans_model.TransitionStateToPhone(trans_state); - int32 forward_pdf_class = trans_model.GetTopo().TopologyForPhone(phone)[0].forward_pdf_class; - if (forward_pdf_class != kNoPdf) // initial-state of the current phone is emitting - if (trans_model.TransitionStateToHmmState(trans_state) != 0) - was_ok = false; - for (size_t j = cur_point; j < end_points[i]; j++) + for (size_t j = cur_start; j < end_point; j++) split_output->back().push_back(alignment[j]); - cur_point = end_points[i]; + cur_start = end_point; } return was_ok; } -bool SplitToPhones(const TransitionModel &trans_model, +bool SplitToPhones(const Transitions &trans_model, const std::vector &alignment, std::vector > *split_alignment) { KALDI_ASSERT(split_alignment != NULL); split_alignment->clear(); - bool is_reordered = IsReordered(trans_model, alignment); - return SplitToPhonesInternal(trans_model, alignment, - is_reordered, split_alignment); + return SplitToPhonesInternal(trans_model, alignment, split_alignment); } @@ -740,31 +607,32 @@ bool SplitToPhones(const TransitionModel &trans_model, 'subsample' value is not 1). 
*/ static inline void ConvertAlignmentForPhone( - const TransitionModel &old_trans_model, - const TransitionModel &new_trans_model, + const Transitions &old_trans_model, + const Transitions &new_trans_model, const ContextDependencyInterface &new_ctx_dep, const std::vector &old_phone_alignment, const std::vector &new_phone_window, - bool old_is_reordered, - bool new_is_reordered, std::vector *new_phone_alignment) { + KALDI_ASSERT(!old_phone_alignment.empty()); int32 alignment_size = old_phone_alignment.size(); static bool warned_topology = false; int32 P = new_ctx_dep.CentralPosition(), - old_central_phone = old_trans_model.TransitionIdToPhone( - old_phone_alignment[0]), + old_central_phone = old_trans_model.InfoForTransitionId( + old_phone_alignment[0]).phone, new_central_phone = new_phone_window[P]; - const HmmTopology &old_topo = old_trans_model.GetTopo(), + const Topology &old_topo = old_trans_model.GetTopo(), &new_topo = new_trans_model.GetTopo(); - bool topology_mismatch = !(old_topo.TopologyForPhone(old_central_phone) == - new_topo.TopologyForPhone(new_central_phone)); - if (topology_mismatch) { - if (!warned_topology) { - warned_topology = true; - KALDI_WARN << "Topology mismatch detected; automatically converting. " - << "Won't warn again."; - } + // TODO(galv): Do we need the transition costs to be the same? Right + // now, I am assuming that we do, but it is unclear to me that we + // really need this. + bool topology_mismatch = !fst::Equal(old_topo.TopologyForPhone(old_central_phone), + new_topo.TopologyForPhone(new_central_phone), + 0.0); + if (topology_mismatch && !warned_topology) { + warned_topology = true; + KALDI_WARN << "Topology mismatch detected; automatically converting. " + << "Won't warn again."; } bool length_mismatch = (new_phone_alignment->size() != old_phone_alignment.size()); @@ -773,16 +641,12 @@ static inline void ConvertAlignmentForPhone( // old alignment. GetRandomAlignmentForPhone(new_ctx_dep, new_trans_model, new_phone_window, new_phone_alignment); - if (new_is_reordered) - ChangeReorderingOfAlignment(new_trans_model, new_phone_alignment); return; } - KALDI_ASSERT(!old_phone_alignment.empty()); - int32 new_num_pdf_classes = new_topo.NumPdfClasses(new_central_phone); - std::vector pdf_ids(new_num_pdf_classes); // Indexed by pdf-class - for (int32 pdf_class = 0; pdf_class < new_num_pdf_classes; pdf_class++) { + std::vector pdf_ids(new_num_pdf_classes + 1); // Indexed by pdf-class + for (int32 pdf_class = 1; pdf_class <= new_num_pdf_classes; pdf_class++) { if (!new_ctx_dep.Compute(new_phone_window, pdf_class, &(pdf_ids[pdf_class]))) { std::ostringstream ss; @@ -793,28 +657,23 @@ static inline void ConvertAlignmentForPhone( } // the topologies and lengths match -> we can directly transfer - // the alignment. + // the alignment (assume the pdf-classes are identical). 
for (int32 j = 0; j < alignment_size; j++) { - int32 old_tid = old_phone_alignment[j], - old_tstate = old_trans_model.TransitionIdToTransitionState(old_tid); - int32 forward_pdf_class = - old_trans_model.TransitionStateToForwardPdfClass(old_tstate), - self_loop_pdf_class = - old_trans_model.TransitionStateToSelfLoopPdfClass(old_tstate); - int32 hmm_state = old_trans_model.TransitionIdToHmmState(old_tid); - int32 trans_idx = old_trans_model.TransitionIdToTransitionIndex(old_tid); - int32 new_forward_pdf = pdf_ids[forward_pdf_class]; - int32 new_self_loop_pdf = pdf_ids[self_loop_pdf_class]; - int32 new_trans_state = - new_trans_model.TupleToTransitionState(new_central_phone, hmm_state, - new_forward_pdf, new_self_loop_pdf); + int32 old_tid = old_phone_alignment[j]; + auto&& info = old_trans_model.InfoForTransitionId(old_tid); + int32 old_pdf_class = old_trans_model.PdfClassForTid(old_tid); + int32 old_self_loop_pdf_class = ( + info.self_loop_pdf_id != -1 ? + old_trans_model.PdfClassForTid(info.self_loop_transition_id) : -1); + int32 new_pdf_id = pdf_ids[old_pdf_class]; + int32 new_self_loop_pdf_id = (old_self_loop_pdf_class != -1 ? + pdf_ids[old_self_loop_pdf_class] : -1); int32 new_tid = - new_trans_model.PairToTransitionId(new_trans_state, trans_idx); + new_trans_model.TupleToTransitionId(new_central_phone, info.topo_state, + info.arc_index, new_pdf_id, + new_self_loop_pdf_id); (*new_phone_alignment)[j] = new_tid; } - - if (new_is_reordered != old_is_reordered) - ChangeReorderingOfAlignment(new_trans_model, new_phone_alignment); } @@ -846,7 +705,7 @@ static inline void ConvertAlignmentForPhone( reduced-frame-rate system. @param new_lengths [out] The vector for storing new lengths. */ -static bool ComputeNewPhoneLengths(const HmmTopology &topology, +static bool ComputeNewPhoneLengths(const Topology &topology, const std::vector &mapped_phones, const std::vector &old_lengths, int32 conversion_shift, @@ -923,17 +782,16 @@ static bool ComputeNewPhoneLengths(const HmmTopology &topology, 'conversion_shift' is for. */ -static bool ConvertAlignmentInternal(const TransitionModel &old_trans_model, - const TransitionModel &new_trans_model, - const ContextDependencyInterface &new_ctx_dep, - const std::vector &old_alignment, - int32 conversion_shift, - int32 subsample_factor, - bool new_is_reordered, - const std::vector *phone_map, - std::vector *new_alignment) { +static bool ConvertAlignmentInternal( + const Transitions &old_trans_model, + const Transitions &new_trans_model, + const ContextDependencyInterface &new_ctx_dep, + const std::vector &old_alignment, + int32 conversion_shift, + int32 subsample_factor, + const std::vector *phone_map, + std::vector *new_alignment) { KALDI_ASSERT(0 <= conversion_shift && conversion_shift < subsample_factor); - bool old_is_reordered = IsReordered(old_trans_model, old_alignment); KALDI_ASSERT(new_alignment != NULL); new_alignment->clear(); new_alignment->reserve(old_alignment.size()); @@ -944,7 +802,7 @@ static bool ConvertAlignmentInternal(const TransitionModel &old_trans_model, std::vector mapped_phones(phone_sequence_length); for (size_t i = 0; i < phone_sequence_length; i++) { KALDI_ASSERT(!old_split[i].empty()); - mapped_phones[i] = old_trans_model.TransitionIdToPhone(old_split[i][0]); + mapped_phones[i] = old_trans_model.InfoForTransitionId(old_split[i][0]).phone; if (phone_map != NULL) { // Map the phone sequence. 
int32 sz = phone_map->size(); if (mapped_phones[i] < 0 || mapped_phones[i] >= sz || @@ -998,7 +856,6 @@ static bool ConvertAlignmentInternal(const TransitionModel &old_trans_model, ConvertAlignmentForPhone(old_trans_model, new_trans_model, new_ctx_dep, old_alignment_for_phone, new_phone_window, - old_is_reordered, new_is_reordered, &new_alignment_for_phone); new_alignment->insert(new_alignment->end(), new_alignment_for_phone.begin(), @@ -1010,29 +867,35 @@ static bool ConvertAlignmentInternal(const TransitionModel &old_trans_model, return true; } -bool ConvertAlignment(const TransitionModel &old_trans_model, - const TransitionModel &new_trans_model, +bool ConvertAlignment(const Transitions &old_trans_model, + const Transitions &new_trans_model, const ContextDependencyInterface &new_ctx_dep, const std::vector &old_alignment, int32 subsample_factor, bool repeat_frames, - bool new_is_reordered, const std::vector *phone_map, std::vector *new_alignment) { - if (!repeat_frames || subsample_factor == 1) { + if (subsample_factor == 1 && repeat_frames) + KALDI_WARN << "repeat_frames being set to true has no effect when " + "subsample_factor=1 (its default value)"; + + if (subsample_factor == 1 || !repeat_frames) { return ConvertAlignmentInternal(old_trans_model, new_trans_model, new_ctx_dep, old_alignment, - subsample_factor - 1, + subsample_factor - 1, // == 0 subsample_factor, - new_is_reordered, phone_map, new_alignment); // The value "subsample_factor - 1" for conversion_shift above ensures the // alignments have the same length as the output of 'subsample-feats' } else { + // either repeat_frames or subsample_factor >= 2. But if repeat_frames == True + // then and subsample_factor == 1, then it is the same as the above. std::vector > shifted_alignments(subsample_factor); + // We create alignments for all shifts from [subsample_factor -1 + // to 0], inclusive. for (int32 conversion_shift = subsample_factor - 1; conversion_shift >= 0; conversion_shift--) { if (!ConvertAlignmentInternal(old_trans_model, @@ -1041,7 +904,6 @@ bool ConvertAlignment(const TransitionModel &old_trans_model, old_alignment, conversion_shift, subsample_factor, - new_is_reordered, phone_map, &shifted_alignments[conversion_shift])) return false; @@ -1061,34 +923,9 @@ bool ConvertAlignment(const TransitionModel &old_trans_model, return true; } -// Returns the scaled, but not negated, log-prob, with the given scaling factors. 
-static BaseFloat GetScaledTransitionLogProb(const TransitionModel &trans_model, - int32 trans_id, - BaseFloat transition_scale, - BaseFloat self_loop_scale) { - if (transition_scale == self_loop_scale) { - return trans_model.GetTransitionLogProb(trans_id) * transition_scale; - } else { - if (trans_model.IsSelfLoop(trans_id)) { - return self_loop_scale * trans_model.GetTransitionLogProb(trans_id); - } else { - int32 trans_state = trans_model.TransitionIdToTransitionState(trans_id); - return self_loop_scale * trans_model.GetNonSelfLoopLogProb(trans_state) - + transition_scale * trans_model.GetTransitionLogProbIgnoringSelfLoops(trans_id); - // This could be simplified to - // (self_loop_scale - transition_scale) * trans_model.GetNonSelfLoopLogProb(trans_state) - // + trans_model.GetTransitionLogProb(trans_id); - // this simplifies if self_loop_scale == 0.0 - } - } -} - - - -void AddTransitionProbs(const TransitionModel &trans_model, +void AddTransitionProbs(const Transitions &trans_model, const std::vector &disambig_syms, // may be empty BaseFloat transition_scale, - BaseFloat self_loop_scale, fst::VectorFst *fst) { using namespace fst; KALDI_ASSERT(IsSortedAndUniq(disambig_syms)); @@ -1102,25 +939,21 @@ void AddTransitionProbs(const TransitionModel &trans_model, StdArc arc = aiter.Value(); StdArc::Label l = arc.ilabel; if (l >= 1 && l <= num_tids) { // a transition-id. - BaseFloat scaled_log_prob = GetScaledTransitionLogProb(trans_model, - l, - transition_scale, - self_loop_scale); + BaseFloat scaled_log_prob = + trans_model.InfoForTransitionId(l).transition_cost * transition_scale; arc.weight = Times(arc.weight, TropicalWeight(-scaled_log_prob)); - } else if (l != 0) { - if (!std::binary_search(disambig_syms.begin(), disambig_syms.end(), - arc.ilabel)) - KALDI_ERR << "AddTransitionProbs: invalid symbol " << arc.ilabel - << " on graph input side."; + } else if (l != 0 && !std::binary_search(disambig_syms.begin(), + disambig_syms.end(),l)) { + KALDI_ERR << "AddTransitionProbs: invalid symbol " << arc.ilabel + << " on graph input side."; } aiter.SetValue(arc); } } } -void AddTransitionProbs(const TransitionModel &trans_model, +void AddTransitionProbs(const Transitions &trans_model, BaseFloat transition_scale, - BaseFloat self_loop_scale, Lattice *lat) { using namespace fst; int num_tids = trans_model.NumTransitionIds(); @@ -1133,10 +966,8 @@ void AddTransitionProbs(const TransitionModel &trans_model, LatticeArc arc = aiter.Value(); LatticeArc::Label l = arc.ilabel; if (l >= 1 && l <= num_tids) { // a transition-id. - BaseFloat scaled_log_prob = GetScaledTransitionLogProb(trans_model, - l, - transition_scale, - self_loop_scale); + BaseFloat scaled_log_prob = + trans_model.InfoForTransitionId(l).transition_cost * transition_scale; // cost is negated log prob. arc.weight.SetValue1(arc.weight.Value1() - scaled_log_prob); } else if (l != 0) { @@ -1204,16 +1035,77 @@ bool ConvertPhnxToProns(const std::vector &phnx, } + + +void AddTransitionProbs(const Transitions &trans_model, + const std::vector &disambig_syms, // may be empty + fst::VectorFst *fst) { + using namespace fst; + KALDI_ASSERT(IsSortedAndUniq(disambig_syms)); + int num_tids = trans_model.NumTransitionIds(); + for (StateIterator > siter(*fst); + !siter.Done(); + siter.Next()) { + for (MutableArcIterator > aiter(fst, siter.Value()); + !aiter.Done(); + aiter.Next()) { + StdArc arc = aiter.Value(); + StdArc::Label l = arc.ilabel; + if (l >= 1 && l <= num_tids) { // a transition-id. 
+ BaseFloat cost = trans_model.InfoForTransitionId(l).transition_cost; + arc.weight = Times(arc.weight, TropicalWeight(cost)); + } else if (l != 0) { + if (!std::binary_search(disambig_syms.begin(), disambig_syms.end(), + arc.ilabel)) + KALDI_ERR << "AddTransitionProbs: invalid symbol " << arc.ilabel + << " on graph input side."; + } + aiter.SetValue(arc); + } + } +} + +void AddTransitionProbs(const Transitions &trans_model, + Lattice *lat) { + using namespace fst; + int num_tids = trans_model.NumTransitionIds(); + for (fst::StateIterator siter(*lat); + !siter.Done(); + siter.Next()) { + for (MutableArcIterator aiter(lat, siter.Value()); + !aiter.Done(); + aiter.Next()) { + LatticeArc arc = aiter.Value(); + LatticeArc::Label l = arc.ilabel; + if (l >= 1 && l <= num_tids) { // a transition-id. + BaseFloat cost = trans_model.InfoForTransitionId(l).transition_cost; + arc.weight.SetValue1(arc.weight.Value1() + cost); + } else if (l != 0) { + KALDI_ERR << "AddTransitionProbs: invalid symbol " << arc.ilabel + << " on lattice input side."; + } + aiter.SetValue(arc); + } + } +} + + + + + void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep, - const TransitionModel &trans_model, + const Transitions &trans_model, const std::vector &phone_window, std::vector *alignment) { typedef fst::StdArc Arc; int32 length = alignment->size(); - BaseFloat prob_scale = 0.0; - fst::VectorFst *fst = GetHmmAsFsaSimple(phone_window, ctx_dep, - trans_model, prob_scale); - fst::RmEpsilon(fst); + bool include_self_loops = true; + std::shared_ptr fst = + GetHmmAsFsa(phone_window, ctx_dep, + trans_model, + include_self_loops); + + fst::RmEpsilon(fst.get()); fst::VectorFst length_constraint_fst; { // set up length_constraint_fst. @@ -1253,41 +1145,10 @@ void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep, bool ans = fst::GetLinearSymbolSequence( single_path_fst, &symbol_sequence, NULL, NULL); KALDI_ASSERT(ans && symbol_sequence.size() == length); + KALDI_PARANOID_ASSERT( + trans_model.InfoForTransitionId(symbol_sequence.front()).is_initial && + trans_model.InfoForTransitionId(symbol_sequence.back()).is_final); symbol_sequence.swap(*alignment); - delete fst; -} - -void ChangeReorderingOfAlignment(const TransitionModel &trans_model, - std::vector *alignment) { - int32 start_pos = 0, size = alignment->size(); - while (start_pos != size) { - int32 start_tid = (*alignment)[start_pos]; - int32 cur_tstate = trans_model.TransitionIdToTransitionState(start_tid); - bool start_is_self_loop = trans_model.IsSelfLoop(start_tid) ? 0 : 1; - int32 end_pos = start_pos + 1; - // If the first instance of this transition-state was a self-loop, then eat - // only non-self-loops of this state; if it was a non-self-loop, then eat - // only self-loops of this state. Imposing this condition on self-loops - // would only actually matter in the rare circumstances that phones can - // have length 1. - while (end_pos != size && - trans_model.TransitionIdToTransitionState((*alignment)[end_pos]) == - cur_tstate) { - bool this_is_self_loop = trans_model.IsSelfLoop((*alignment)[end_pos]); - if (!this_is_self_loop) { - if (start_is_self_loop) { - break; // stop before including this transition-id. - } else { - end_pos++; - break; // stop after including this transition-id. 
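The block above that sets up length_constraint_fst is elided in this hunk; the following self-contained sketch (with an assumed helper name, and allowed_labels standing in for the transition-ids of the phone FSA) shows one way such a constraint can be built: an acceptor with states 0..length in which only state `length` is final, so composing it with the phone FSA keeps exactly the paths of the requested length.

#include <vector>
#include <fst/fstlib.h>

// Illustrative only: accepts exactly `length` symbols drawn from allowed_labels.
fst::StdVectorFst MakeLengthConstraintFst(int length,
                                          const std::vector<int> &allowed_labels) {
  fst::StdVectorFst constraint;
  for (int i = 0; i <= length; i++)
    constraint.AddState();
  constraint.SetStart(0);
  constraint.SetFinal(length, fst::TropicalWeight::One());
  for (int i = 0; i < length; i++)
    for (int label : allowed_labels)
      constraint.AddArc(i, fst::StdArc(label, label,
                                       fst::TropicalWeight::One(), i + 1));
  return constraint;
}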
- } - } - end_pos++; - } - std::swap((*alignment)[start_pos], (*alignment)[end_pos - 1]); - start_pos = end_pos; - } } - } // namespace kaldi diff --git a/src/hmm/hmm-utils.h b/src/hmm/hmm-utils.h index a8ad846949e..bc9e3eaeaa7 100644 --- a/src/hmm/hmm-utils.h +++ b/src/hmm/hmm-utils.h @@ -1,6 +1,7 @@ // hmm/hmm-utils.h // Copyright 2009-2011 Microsoft Corporation +// 2019 Daniel Galvez // See ../../COPYING for clarification regarding multiple authors // @@ -20,8 +21,10 @@ #ifndef KALDI_HMM_HMM_UTILS_H_ #define KALDI_HMM_HMM_UTILS_H_ -#include "hmm/hmm-topology.h" -#include "hmm/transition-model.h" +#include + +#include "hmm/topology.h" +#include "hmm/transitions.h" #include "lat/kaldi-lattice.h" namespace kaldi { @@ -34,19 +37,16 @@ namespace kaldi { /// Configuration class for the GetHTransducer() function; see /// \ref hmm_graph_config for context. struct HTransducerConfig { - /// Transition log-prob scale, see \ref hmm_scale. - /// Note this doesn't apply to self-loops; GetHTransducer() does - /// not include self-loops. - BaseFloat transition_scale; int32 nonterm_phones_offset; + // We don't currently make `include_self_loops` configurable from the command + // line; it's included in order to make it obvious how to add the self loops. + bool include_self_loops; HTransducerConfig(): - transition_scale(1.0), - nonterm_phones_offset(-1) { } + nonterm_phones_offset(-1), + include_self_loops(false) { } void Register (OptionsItf *opts) { - opts->Register("transition-scale", &transition_scale, - "Scale of transition probs (relative to LM)"); opts->Register("nonterm-phones-offset", &nonterm_phones_offset, "The integer id of #nonterm_bos in phones.txt, if present. " "Only needs to be set if you are doing grammar decoding, " @@ -66,7 +66,7 @@ struct HmmCacheHash { /// HmmCacheType is a map from (central-phone, sequence of pdf-ids) to FST, used /// as cache in GetHmmAsFsa, as an optimization. typedef unordered_map >, - fst::VectorFst*, + std::shared_ptr, HmmCacheHash> HmmCacheType; @@ -76,38 +76,32 @@ typedef unordered_map >, /// "Fst". This acceptor does not include self-loops; you have to call /// AddSelfLoops() for that. (We do that at a later graph compilation phase, /// for efficiency). The labels on the FSA correspond to transition-ids. +/// But now we already have self-loops... Problematic? /// /// as the symbols. /// For documentation in context, see \ref hmm_graph_get_hmm_as_fst -/// @param context_window A vector representing the phonetic context; see +/// @param [in] context_window A vector representing the phonetic context; see /// \ref tree_window "here" for explanation. -/// @param ctx_dep The object that contains the phonetic decision-tree -/// @param trans_model The transition-model object, which provides +/// @param [in] ctx_dep The object that contains the phonetic decision-tree +/// @param [in] trans_model The transition-model object, which provides /// the mappings to transition-ids and also the transition /// probabilities. -/// @param config Configuration object, see \ref HTransducerConfig. +/// @param [in] include_self_loops. If true, self-loop arcs will be +/// included in the result; if false, they will be omitted and +/// the probabilities appropriately renormalized; you can +/// add them later using AddSelfLoops(). 
/// @param cache Object used as a lookaside buffer to save computation; /// if it finds that the object it needs is already there, it will -/// just return a pointer value from "cache"-- not that this means +/// just return a pointer value from "cache"-- note that this means /// you have to be careful not to delete things twice. -fst::VectorFst *GetHmmAsFsa( - std::vector context_window, +std::shared_ptr GetHmmAsFsa( + const std::vector &context_window, const ContextDependencyInterface &ctx_dep, - const TransitionModel &trans_model, - const HTransducerConfig &config, + const Transitions &trans_model, + bool include_self_loops = false, HmmCacheType *cache = NULL); -/// Included mainly as a form of documentation, not used in any other code -/// currently. Creates the acceptor FST with self-loops, and with fewer -/// options. -fst::VectorFst* -GetHmmAsFsaSimple(std::vector context_window, - const ContextDependencyInterface &ctx_dep, - const TransitionModel &trans_model, - BaseFloat prob_scale); - - /** * Returns the H tranducer; result owned by caller. Caution: our version of * the H transducer does not include self-loops; you have to add those later. @@ -123,10 +117,10 @@ GetHmmAsFsaSimple(std::vector context_window, * the input of the transducer (i.e. same symbol type as whatever is on the * input of the transducer */ -fst::VectorFst* +std::unique_ptr> GetHTransducer(const std::vector > &ilabel_info, const ContextDependencyInterface &ctx_dep, - const TransitionModel &trans_model, + const Transitions &trans_model, const HTransducerConfig &config, std::vector *disambig_syms_left); @@ -148,7 +142,7 @@ GetHTransducer(const std::vector > &ilabel_info, */ void GetIlabelMapping(const std::vector > &ilabel_info_old, const ContextDependencyInterface &ctx_dep, - const TransitionModel &trans_model, + const Transitions &trans_model, std::vector *old2new_map); @@ -164,34 +158,30 @@ void GetIlabelMapping(const std::vector > &ilabel_info_old, * same as disambiguation symbols, assuming they are special symbols for * grammar decoding. * - * @param trans_model [in] Transition model - * @param disambig_syms [in] Sorted, uniq list of disambiguation symbols, required - * if the graph contains disambiguation symbols but only needed for sanity checks. - * @param self_loop_scale [in] Transition-probability scale for self-loops; c.f. - * \ref hmm_scale - * @param reorder [in] If true, reorders the transitions (see \ref hmm_reorder). - * You'll normally want this to be true. - * @param check_no_self_loops [in] If true, it will check that there are no - * self-loops in the original graph; you'll normally want - * this to be true. If false, it will allow them, and - * will add self-loops after the original self-loop - * transitions, assuming reorder==true... this happens to - * be what we want when converting normal to unconstrained - * chain examples. WARNING: this was added in 2018; - * if you get a compilation error, add this as 'true', - * which emulates the behavior of older code. - * @param fst [in, out] The FST to be modified. + * @param [in] trans_model Transition model + * @param [in] disambig_syms Sorted, unique list of disambiguation symbols, required + * if the graph contains disambiguation symbols but only needed for sanity checks. + * @param [in] currently_self_loop_free If true, we require (and check) that + * the graph was free of self-loops at entry. If + * false, it assumes that some states may already have + * self-loops, and will refrain from adding duplicate + * self-loop to them. 
+ * @param [in] use_weights If true, weights will be used (which + * includes a correction term to make things continue to + * sum to one); otherwise, we add the new self-loop arcs + * with probability One(). + * @param fst [in, out] The FST to be modified. This should normally be HCLG + * or any other FST with transition ids as its input + * labels. */ -void AddSelfLoops(const TransitionModel &trans_model, +void AddSelfLoops(const Transitions &trans_model, const std::vector &disambig_syms, // used as a check only. - BaseFloat self_loop_scale, - bool reorder, - bool check_no_self_loops, + bool currently_self_loop_free, + bool use_weights, fst::VectorFst *fst); /** - * Adds transition-probs, with the supplied - * scales (see \ref hmm_scale), to the graph. + * Adds transition-prob to the graph. * Useful if you want to create a graph without transition probs, then possibly * train the model (including the transition probs) but keep the graph fixed, * and add back in the transition probs. It assumes the fst has transition-ids @@ -200,36 +190,29 @@ void AddSelfLoops(const TransitionModel &trans_model, * @param disambig_syms [in] A list of disambiguation symbols, required if the * graph has disambiguation symbols on its input but only * used for checks. - * @param transition_scale [in] A scale on transition-probabilities apart from - * those involving self-loops; see \ref hmm_scale. - * @param self_loop_scale [in] A scale on self-loop transition probabilities; - * see \ref hmm_scale. * @param fst [in, out] The FST to be modified. */ -void AddTransitionProbs(const TransitionModel &trans_model, +void AddTransitionProbs(const Transitions &trans_model, const std::vector &disambig_syms, - BaseFloat transition_scale, - BaseFloat self_loop_scale, fst::VectorFst *fst); /** This is as AddSelfLoops(), but operates on a Lattice, where it affects the graph part of the weight (the first element of the pair). */ -void AddTransitionProbs(const TransitionModel &trans_model, - BaseFloat transition_scale, - BaseFloat self_loop_scale, +void AddTransitionProbs(const Transitions &trans_model, Lattice *lat); + /// Returns a transducer from pdfs plus one (input) to transition-ids (output). -/// Currenly of use only for testing. -fst::VectorFst* -GetPdfToTransitionIdTransducer(const TransitionModel &trans_model); +/// Currently of use only for testing. +std::unique_ptr> +GetPdfToTransitionIdTransducer(const Transitions &trans_model); /// Converts all transition-ids in the FST to pdfs plus one. /// Placeholder: not implemented yet! -void ConvertTransitionIdsToPdfs(const TransitionModel &trans_model, +void ConvertTransitionIdsToPdfs(const Transitions &trans_model, const std::vector &disambig_syms, fst::VectorFst *fst); @@ -248,7 +231,7 @@ void ConvertTransitionIdsToPdfs(const TransitionModel &trans_model, /// die or throw an exception. /// This function works out by itself whether the graph was created /// with "reordering", and just does the right thing. -bool SplitToPhones(const TransitionModel &trans_model, +bool SplitToPhones(const Transitions &trans_model, const std::vector &alignment, std::vector > *split_alignment); @@ -277,20 +260,16 @@ bool SplitToPhones(const TransitionModel &trans_model, 'subsample_factor' separately generated alignments, to keep the phone boundaries the same as the input where possible.] - @param reorder [in] True if you want the pdf-ids on the new alignment to - be 'reordered'. (vs. 
the way they appear in - the HmmTopology object) @param phone_map [in] If non-NULL, map from old to new phones. @param new_alignment [out] The converted alignment. */ -bool ConvertAlignment(const TransitionModel &old_trans_model, - const TransitionModel &new_trans_model, +bool ConvertAlignment(const Transitions &old_trans_model, + const Transitions &new_trans_model, const ContextDependencyInterface &new_ctx_dep, const std::vector &old_alignment, int32 subsample_factor, // 1 in the normal case -> no subsampling. bool repeat_frames, - bool reorder, const std::vector *phone_map, // may be NULL std::vector *new_alignment); @@ -319,16 +298,10 @@ bool ConvertPhnxToProns(const std::vector &phnx, The alignment will be without 'reordering'. */ void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep, - const TransitionModel &trans_model, + const Transitions &trans_model, const std::vector &phone_window, std::vector *alignment); -/* - If the alignment was non-reordered makes it reordered, and vice versa. -*/ -void ChangeReorderingOfAlignment(const TransitionModel &trans_model, - std::vector *alignment); - /// @} end "addtogroup hmm_group" } // end namespace kaldi diff --git a/src/hmm/posterior.cc b/src/hmm/posterior.cc index 860a979a0ce..4742c0f7824 100644 --- a/src/hmm/posterior.cc +++ b/src/hmm/posterior.cc @@ -299,19 +299,19 @@ void AlignmentToPosterior(const std::vector &ali, } struct ComparePosteriorByPdfs { - const TransitionModel *tmodel_; - ComparePosteriorByPdfs(const TransitionModel &tmodel): tmodel_(&tmodel) {} + const Transitions *tmodel_; + ComparePosteriorByPdfs(const Transitions &tmodel): tmodel_(&tmodel) {} bool operator() (const std::pair &a, const std::pair &b) { - if (tmodel_->TransitionIdToPdf(a.first) - < tmodel_->TransitionIdToPdf(b.first)) + if (tmodel_->TransitionIdToPdfFast(a.first) + < tmodel_->TransitionIdToPdfFast(b.first)) return true; else return false; } }; -void SortPosteriorByPdfs(const TransitionModel &tmodel, +void SortPosteriorByPdfs(const Transitions &tmodel, Posterior *post) { ComparePosteriorByPdfs compare(tmodel); for (size_t i = 0; i < post->size(); i++) { @@ -319,7 +319,7 @@ void SortPosteriorByPdfs(const TransitionModel &tmodel, } } -void ConvertPosteriorToPdfs(const TransitionModel &tmodel, +void ConvertPosteriorToPdfs(const Transitions &tmodel, const Posterior &post_in, Posterior *post_out) { post_out->clear(); @@ -328,7 +328,7 @@ void ConvertPosteriorToPdfs(const TransitionModel &tmodel, unordered_map pdf_to_post; for (size_t j = 0; j < post_in[i].size(); j++) { int32 tid = post_in[i][j].first, - pdf_id = tmodel.TransitionIdToPdf(tid); + pdf_id = tmodel.TransitionIdToPdfFast(tid); BaseFloat post = post_in[i][j].second; if (pdf_to_post.count(pdf_id) == 0) pdf_to_post[pdf_id] = post; @@ -345,7 +345,7 @@ void ConvertPosteriorToPdfs(const TransitionModel &tmodel, } } -void ConvertPosteriorToPhones(const TransitionModel &tmodel, +void ConvertPosteriorToPhones(const Transitions &tmodel, const Posterior &post_in, Posterior *post_out) { post_out->clear(); @@ -354,7 +354,7 @@ void ConvertPosteriorToPhones(const TransitionModel &tmodel, std::map phone_to_post; for (size_t j = 0; j < post_in[i].size(); j++) { int32 tid = post_in[i][j].first, - phone_id = tmodel.TransitionIdToPhone(tid); + phone_id = tmodel.InfoForTransitionId(tid).phone; BaseFloat post = post_in[i][j].second; if (phone_to_post.count(phone_id) == 0) phone_to_post[phone_id] = post; @@ -372,7 +372,7 @@ void ConvertPosteriorToPhones(const TransitionModel &tmodel, } -void 
WeightSilencePost(const TransitionModel &trans_model, +void WeightSilencePost(const Transitions &trans_model, const ConstIntegerSet &silence_set, BaseFloat silence_scale, Posterior *post) { @@ -381,7 +381,7 @@ void WeightSilencePost(const TransitionModel &trans_model, this_post.reserve((*post)[i].size()); for (size_t j = 0; j < (*post)[i].size(); j++) { int32 tid = (*post)[i][j].first, - phone = trans_model.TransitionIdToPhone(tid); + phone = trans_model.InfoForTransitionId(tid).phone; BaseFloat weight = (*post)[i][j].second; if (silence_set.count(phone) != 0) { // is a silence. if (silence_scale != 0.0) @@ -395,7 +395,7 @@ void WeightSilencePost(const TransitionModel &trans_model, } -void WeightSilencePostDistributed(const TransitionModel &trans_model, +void WeightSilencePostDistributed(const Transitions &trans_model, const ConstIntegerSet &silence_set, BaseFloat silence_scale, Posterior *post) { @@ -405,7 +405,7 @@ void WeightSilencePostDistributed(const TransitionModel &trans_model, BaseFloat sil_weight = 0.0, nonsil_weight = 0.0; for (size_t j = 0; j < (*post)[i].size(); j++) { int32 tid = (*post)[i][j].first, - phone = trans_model.TransitionIdToPhone(tid); + phone = trans_model.InfoForTransitionId(tid).phone; BaseFloat weight = (*post)[i][j].second; if (silence_set.count(phone) != 0) sil_weight += weight; else nonsil_weight += weight; @@ -537,7 +537,7 @@ template void PosteriorToMatrix(const Posterior &post, template void PosteriorToPdfMatrix(const Posterior &post, - const TransitionModel &model, + const Transitions &model, Matrix *mat) { // Allocate the matrix, int32 num_rows = post.size(), @@ -546,7 +546,7 @@ void PosteriorToPdfMatrix(const Posterior &post, // Fill from Posterior, for (int32 t = 0; t < post.size(); t++) { for (int32 i = 0; i < post[t].size(); i++) { - int32 col = model.TransitionIdToPdf(post[t][i].first); + int32 col = model.TransitionIdToPdfFast(post[t][i].first); if (col >= num_cols) { KALDI_ERR << "Out-of-bound Posterior element with index " << col << ", higher than number of columns " << num_cols; @@ -557,10 +557,10 @@ void PosteriorToPdfMatrix(const Posterior &post, } // instantiate the template function, template void PosteriorToPdfMatrix(const Posterior &post, - const TransitionModel &model, + const Transitions &model, Matrix *mat); template void PosteriorToPdfMatrix(const Posterior &post, - const TransitionModel &model, + const Transitions &model, Matrix *mat); } // End namespace kaldi diff --git a/src/hmm/posterior.h b/src/hmm/posterior.h index e153c249740..7663cf0ce42 100644 --- a/src/hmm/posterior.h +++ b/src/hmm/posterior.h @@ -26,7 +26,7 @@ #include "base/kaldi-common.h" #include "util/const-integer-set.h" #include "util/kaldi-table.h" -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "matrix/kaldi-matrix.h" @@ -205,19 +205,19 @@ void AlignmentToPosterior(const std::vector &ali, /// Sorts posterior entries so that transition-ids with same pdf-id are next to /// each other. -void SortPosteriorByPdfs(const TransitionModel &tmodel, +void SortPosteriorByPdfs(const Transitions &tmodel, Posterior *post); /// Converts a posterior over transition-ids to be a posterior /// over pdf-ids. -void ConvertPosteriorToPdfs(const TransitionModel &tmodel, +void ConvertPosteriorToPdfs(const Transitions &tmodel, const Posterior &post_in, Posterior *post_out); /// Converts a posterior over transition-ids to be a posterior /// over phones. 
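As a standalone illustration of the merging that ConvertPosteriorToPdfs() and ConvertPosteriorToPhones() perform on each frame (simplified types and a hypothetical helper name, not the library API): all entries of a frame whose transition-ids map to the same target id have their weights summed.

#include <unordered_map>
#include <utility>
#include <vector>

// Sketch: collapse one frame of (transition-id, weight) pairs using a
// caller-supplied map, e.g. transition-id -> pdf-id or transition-id -> phone.
std::vector<std::pair<int, float>> MergeFrameByMappedId(
    const std::vector<std::pair<int, float>> &frame,
    const std::unordered_map<int, int> &tid_to_target) {
  std::unordered_map<int, float> summed;
  for (const auto &entry : frame)
    summed[tid_to_target.at(entry.first)] += entry.second;
  return std::vector<std::pair<int, float>>(summed.begin(), summed.end());
}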
-void ConvertPosteriorToPhones(const TransitionModel &tmodel, +void ConvertPosteriorToPhones(const Transitions &tmodel, const Posterior &post_in, Posterior *post_out); @@ -225,7 +225,7 @@ void ConvertPosteriorToPhones(const TransitionModel &tmodel, /// in the set "silence_set" by scale "silence_scale". /// The interface was changed in Feb 2014 to do the modification /// "in-place" rather than having separate input and output. -void WeightSilencePost(const TransitionModel &trans_model, +void WeightSilencePost(const Transitions &trans_model, const ConstIntegerSet &silence_set, BaseFloat silence_scale, Posterior *post); @@ -236,7 +236,7 @@ void WeightSilencePost(const TransitionModel &trans_model, /// has the effect that frames that are mostly silence get down-weighted. /// The interface was changed in Feb 2014 to do the modification /// "in-place" rather than having separate input and output. -void WeightSilencePostDistributed(const TransitionModel &trans_model, +void WeightSilencePostDistributed(const Transitions &trans_model, const ConstIntegerSet &silence_set, BaseFloat silence_scale, Posterior *post); @@ -250,11 +250,11 @@ void PosteriorToMatrix(const Posterior &post, /// This converts a Posterior to a Matrix. The number of matrix-rows is the same /// as the 'post.size()', the number of matrix-columns is defined by 'NumPdfs' -/// in the TransitionModel. +/// in the Transitions. /// The elements which are not specified in 'Posterior' are equal to zero. template void PosteriorToPdfMatrix(const Posterior &post, - const TransitionModel &model, + const Transitions &model, Matrix *mat); /// @} end "addtogroup posterior_group" diff --git a/src/hmm/hmm-topology-test.cc b/src/hmm/topology-test.cc similarity index 64% rename from src/hmm/hmm-topology-test.cc rename to src/hmm/topology-test.cc index 14081d2355d..7073ce94866 100644 --- a/src/hmm/hmm-topology-test.cc +++ b/src/hmm/topology-test.cc @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation // 2015 Johns Hopkins University (author: Daniel Povey) +// 2019 Hossein Hadian // See ../../COPYING for clarification regarding multiple authors // @@ -18,58 +19,47 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. -#include "hmm/hmm-topology.h" +#include "hmm/topology.h" #include "hmm/hmm-test-utils.h" namespace kaldi { -void TestHmmTopology() { - bool binary = (Rand()%2 == 0); +void TestTopology() { + bool binary = (Rand() % 2 == 0); std::string input_str = "\n" "\n" " 1 2 3 4 5 6 7 8 9 \n" - " 0 0\n" - " 0 0.5\n" - " 1 0.5\n" - " \n" - " 1 1 \n" - " 1 0.5\n" - " 2 0.5\n" - " \n" - " 2 2\n" - " 2 0.5\n" - " 3 0.5\n" - " \n" - " 3 \n" + " 0 1 1 0\n" + " 1 1 1 0.693\n" + " 1 2 2 0.693\n" + " 2 2 2 0.693\n" + " 2 3 3 0.693\n" + " 3 3 3 0.693\n" + " 3 0.693\n\n" " \n" - " \n" - " 10 11 13 \n" - " 0 0\n" - " 0 0.5\n" - " 1 0.5\n" - " \n" - " 1 1 \n" - " 1 0.5\n" - " 2 0.5\n" - " \n" - " 2 " - " \n" - " \n"; + + "\n" + " 10 11 13 \n" + // " 0 0 1 0.693\n" // disallowed! 
+ " 0 1 1 0\n" + " 1 1 2 0.693\n" + " 1 2 2 0.693\n" + " 2 \n\n" + "\n" + "\n"; std::string chain_input_str = "\n" "\n" " 1 2 3 4 5 6 7 8 9 \n" - " 0 0 1\n" - " 0 0.5\n" - " 1 0.5\n" - " \n" - " 1 \n" + " 0 1 1 0.0\n" + " 1 1 2 0.693\n" + " 1 0.693\n\n" "\n" "\n"; - HmmTopology topo; + Topology topo; if (RandInt(0, 1) == 0) { topo = GenRandTopology(); @@ -83,8 +73,7 @@ void TestHmmTopology() { std::ostringstream oss; topo.Write(oss, binary); - HmmTopology topo2; - // std::cout << oss.str() << '\n' << std::flush; + Topology topo2; std::istringstream iss2(oss.str()); topo2.Read(iss2, binary); @@ -96,7 +85,7 @@ void TestHmmTopology() { } { // test chain topology - HmmTopology chain_topo; + Topology chain_topo; std::istringstream chain_iss(chain_input_str); chain_topo.Read(chain_iss, false); KALDI_ASSERT(chain_topo.MinLength(3) == 1); @@ -116,8 +105,7 @@ void TestHmmTopology() { int main() { // repeat the test ten times for (int i = 0; i < 10; i++) { - kaldi::TestHmmTopology(); + kaldi::TestTopology(); } std::cout << "Test OK.\n"; } - diff --git a/src/hmm/topology.cc b/src/hmm/topology.cc new file mode 100644 index 00000000000..4a90a0d5414 --- /dev/null +++ b/src/hmm/topology.cc @@ -0,0 +1,366 @@ +// hmm/topology.cc + +// Copyright 2009-2011 Microsoft Corporation +// 2014-2019 Johns Hopkins University (author: Daniel Povey) +// 2019 Daniel Galvez +// 2019 Hossein Hadian + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "util/common-utils.h" +#include "hmm/topology.h" +#include "util/stl-utils.h" +#include "util/text-utils.h" +#include "fstext/kaldi-fst-io.h" +#include "fstext/fstext-utils.h" + + +namespace kaldi { + +void Topology::Read(std::istream &is, bool binary) { + ExpectToken(is, binary, ""); + if (!binary) { + phones_.clear(); + phone2idx_.clear(); + entries_.clear(); + std::string token; + while ( ! (is >> token).fail() ) { + if (token == "") { + break; // finished parsing. + } else if (token != "") { + KALDI_ERR << "Reading Topology object, expected or " + ", got "<"); + std::vector phones; + std::string s; + while (1) { + is >> s; + if (is.fail()) + KALDI_ERR << "Reading Topology object, unexpected end of file " + "while expecting phones."; + if (s == "") break; + else { + int32 phone; + if (!ConvertStringToInteger(s, &phone)) + KALDI_ERR << "Reading Topology object, expected " + << "integer, got instead " << s; + KALDI_ASSERT(phone > 0); + phones.push_back(phone); + } + } + + int32 entry_index = entries_.size(); + fst::StdVectorFst fst; + ReadFsaKaldi(is, &fst); + entries_.push_back(fst); + + for (int32 phone : phones) { + if (static_cast(phone2idx_.size()) <= phone) + phone2idx_.resize(phone + 1, -1); // -1 is invalid index. 
+ if (phone2idx_[phone] != -1) { + KALDI_ERR << "Phone " + << phone << " appears in multiple topology entries."; + } + phone2idx_[phone] = entry_index; + phones_.push_back(phone); + } + ExpectToken(is, binary, ""); + } + } + std::sort(phones_.begin(), phones_.end()); + KALDI_ASSERT(IsSortedAndUniq(phones_)); + } else { + ReadIntegerVector(is, binary, &phones_); + ReadIntegerVector(is, binary, &phone2idx_); + int32 number_topology_entries; + ReadBasicType(is, binary, &number_topology_entries); + for (size_t index = 0; index < number_topology_entries; ++index) { + fst::StdVectorFst fst; + ReadFstKaldi(is, binary, &fst); + entries_.push_back(fst); + } + ExpectToken(is, binary, ""); + } + ComputeDerived(); + Check(); +} + +// This function writes an FSA in text mode to an output stream. +template +static void WriteFsa(std::ostream &os, const fst::VectorFst &fst) { + os << '\n'; + bool acceptor = true, write_one = false; + fst::FstPrinter printer(fst, fst.InputSymbols(), fst.OutputSymbols(), + NULL, acceptor, write_one, "\t"); + printer.Print(&os, ""); + if (os.fail()) + KALDI_ERR << "Stream failure detected writing FST to stream."; + os << '\n'; + if (!os.good()) + KALDI_ERR << "Error writing FST to stream."; +} + +void Topology::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + if (!binary) { + for (int index = 0; index < entries_.size(); ++index) { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + for (auto phone: phones_) + if (phone2idx_[phone] == index) + os << phone << " "; + os << ""; + WriteFsa(os, entries_[index]); + os << "\n"; + } + } else { + WriteIntegerVector(os, binary, phones_); + WriteIntegerVector(os, binary, phone2idx_); + int32 number_topology_entries = entries_.size(); + WriteBasicType(os, binary, number_topology_entries); + for (auto const& fst : entries_) + WriteFstKaldi(os, binary, fst); + } + WriteToken(os, binary, ""); +} + +void Topology::Check() { + if (entries_.empty() || phones_.empty() || phone2idx_.empty()) + KALDI_ERR << "Empty object."; + std::vector is_seen(entries_.size(), false); + for (size_t i = 0; i < phones_.size(); i++) { + int32 phone = phones_[i]; + if (static_cast(phone) >= phone2idx_.size() || + static_cast(phone2idx_[phone]) >= entries_.size()) + KALDI_ERR << "Phone " << phone << " has no valid index."; + is_seen[phone2idx_[phone]] = true; + } + if (!std::accumulate(is_seen.begin(), + is_seen.end(), true, std::logical_and())) + KALDI_ERR << "Entry with no corresponding phones."; + + for (auto const& entry: entries_) { + if (!fst::Verify(entry)) { + KALDI_ERR << "Ill-formed FST provided."; + } + if (entry.NumStates() <= 1) + KALDI_ERR << "Cannot only have one state (must have a " + << "final state and a start state)."; + bool has_final_state = false; + std::vector seen_pdf_classes; + for (fst::StateIterator state_iter(entry); + !state_iter.Done(); state_iter.Next()) { + StateId state = state_iter.Value(); + if (entry.Final(state) != Weight::Zero()) + has_final_state = true; + + BaseFloat outward_prob_sum = exp(-entry.Final(state).Value()); + for (fst::ArcIterator aiter(entry, state); + !aiter.Done(); aiter.Next()) { + const fst::StdArc &arc(aiter.Value()); + if (arc.ilabel != arc.olabel) + KALDI_ERR << "The topology must be an acceptor but ilabel != olabel."; + if (arc.ilabel == 0) + KALDI_ERR << "Epsilon arcs (pdf-class 0) are not allowed."; + if (arc.nextstate == entry.Start()) + KALDI_ERR << "Start state may not have any inward transitions."; + seen_pdf_classes.push_back(arc.ilabel); + outward_prob_sum += 
exp(-arc.weight.Value()); + } + if (!ApproxEqual(outward_prob_sum, 1.0)) + KALDI_WARN << "Outward transition probabilities should sum to 1.0 " + "for each state, value was: " << outward_prob_sum; + } + if (!has_final_state) { + KALDI_ERR << "Must have a final state."; + } + + if (entry.Final(entry.Start()) != Weight::Zero()) + KALDI_ERR << "Start state must not be a final state."; + + if (entry.Start() != 0) { + KALDI_ERR << "Topology::Check(), start state must be 0."; + } + + SortAndUniq(&seen_pdf_classes); + if (seen_pdf_classes.front() != 1 || + seen_pdf_classes.back() != static_cast(seen_pdf_classes.size())) + KALDI_ERR << "pdf_classes are expected to be " + "contiguous and start from 1."; + + int num_states = entry.NumStates(); + int num_arcs = NumArcs(entry); + fst::StdVectorFst fst(entry); // Call Connect on a copy. + fst::Connect(&fst); + if (entry.NumStates() == 0) + KALDI_ERR << "Some of the states in the topology are not reachable."; + if (fst.NumStates() != num_states || NumArcs(fst) != num_arcs) + KALDI_ERR << "Topology changed after calling Connect()."; + } + KALDI_ASSERT(self_loop_correction_factors_.size() == entries_.size() && + self_loop_pdf_classes_.size() == entries_.size()); +} + +const fst::StdVectorFst& Topology::TopologyForPhone(int32 phone) const { + if (static_cast(phone) >= phone2idx_.size() + || phone2idx_[phone] == -1) + KALDI_ERR << "TopologyForPhone(), phone " << phone << " not covered."; + return entries_[phone2idx_[phone]]; +} + +const std::vector& Topology::CorrectionFactorsForPhone(int32 phone) const { + if (static_cast(phone) >= phone2idx_.size() + || phone2idx_[phone] == -1) + KALDI_ERR << "TopologyForPhone(), phone " << phone << " not covered."; + return self_loop_correction_factors_[phone2idx_[phone]]; +} + +const std::vector& Topology::SelfLoopPdfClassesForPhone(int32 phone) const { + if (static_cast(phone) >= phone2idx_.size() + || phone2idx_[phone] == -1) { + KALDI_ERR << "TopologyForPhone(), phone " << phone << " not covered."; + } + return self_loop_pdf_classes_[phone2idx_[phone]]; +} + + +int32 Topology::NumPdfClasses(int32 phone) const { + // will throw if phone not covered. + const fst::StdVectorFst &entry = TopologyForPhone(phone); + + std::set pdfs; + for (fst::StateIterator siter(entry); + !siter.Done(); siter.Next()) { + StateId state_id = siter.Value(); + for (fst::ArcIterator aiter(entry, state_id); + !aiter.Done(); aiter.Next()) { + pdfs.insert(aiter.Value().ilabel); + } + } + return pdfs.size(); +} + +void Topology::GetPhoneToNumPdfClasses( + std::vector *phone2num_pdf_classes) const { + KALDI_ASSERT(!phones_.empty()); + phone2num_pdf_classes->clear(); + phone2num_pdf_classes->resize(phones_.back() + 1, -1); + for (auto phone: phones_) + (*phone2num_pdf_classes)[phone] = NumPdfClasses(phone); +} + +int32 Topology::MinLength(int32 phone) const { + using Weight = typename fst::StdFst::Weight; + using StateId = typename fst::StdFst::StateId; + const fst::StdVectorFst& this_topo = TopologyForPhone(phone); + // 1) Prepare a new FST with arc weight of 1.f and final state weight of 0.f + // (Note that 0.f == Weight::One() in Tropical Semiring).
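Since the rest of MinLength() relies on tropical-semiring conventions, here is a tiny self-contained reminder (illustrative, not part of the patch) of the identities being used: weights are costs, Times() adds costs, One() is cost 0, and Plus() takes the minimum, which is why unit arc costs turn a shortest-distance computation into a minimum path length.

#include <cassert>
#include <fst/fstlib.h>

int main() {
  using fst::TropicalWeight;
  assert(TropicalWeight::One().Value() == 0.0f);                              // "prob 1" is cost 0
  assert(Times(TropicalWeight(1.0f), TropicalWeight(2.0f)).Value() == 3.0f);  // costs add
  assert(Plus(TropicalWeight(1.0f), TropicalWeight(2.0f)).Value() == 1.0f);   // Plus == min
  return 0;
}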
+ // Copy the topology entry into a VectorFst so that we can mutate its + // arcs and final weights (the entry returned by TopologyForPhone() is const). + std::unique_ptr topo_copy(this_topo.Copy()); + + std::vector final_states; + for (fst::StateIterator siter(*topo_copy); + !siter.Done(); siter.Next()) { + StateId state_id = siter.Value(); + + if (topo_copy->Final(state_id) != Weight::Zero()) { + final_states.push_back(state_id); + topo_copy->SetFinal(state_id, Weight::One()); + } + + for (fst::MutableArcIterator aiter(topo_copy.get(), state_id); + !aiter.Done(); aiter.Next()) { + Arc original_arc = aiter.Value(); + Arc distance_one_arc(original_arc.ilabel, original_arc.olabel, + Weight(1.0f), original_arc.nextstate); + aiter.SetValue(distance_one_arc); + } + } + KALDI_ASSERT(!final_states.empty()); + // Now run single-source shortest-distance from the start state. + std::vector distances; + fst::ShortestDistance(*topo_copy, &distances); + fst::NaturalLess less; + auto min_final_state_iter = + std::min_element(final_states.begin(), final_states.end(), + [&distances, &less](StateId state1, StateId state2) { + return less(distances[state1], distances[state2]); + }); + Weight distance = distances[*min_final_state_iter]; + return static_cast(distance.Value()); +} + +bool Topology::operator==(const Topology &other) const { + if (phones_ != other.phones_ || phone2idx_ != other.phone2idx_ || + entries_.size() != other.entries_.size()) { + return false; + } else { + for (size_t i = 0; i < entries_.size(); ++i) { + if (!fst::Equal(entries_[i], other.entries_[i], /*delta=*/0, + fst::kEqualFsts)) { + return false; + } + } + return true; + } +} + + +void Topology::ComputeDerived() { + using Arc = fst::StdArc; + using StateId = Arc::StateId; + using Weight = Arc::Weight; + + self_loop_correction_factors_.resize(entries_.size()); + self_loop_pdf_classes_.resize(entries_.size()); + for (size_t i = 0; i < entries_.size(); i++) { + const fst::StdVectorFst &entry = entries_[i]; + std::vector &correction_factors( + self_loop_correction_factors_[i]); + std::vector &self_loop_pdf_classes( + self_loop_pdf_classes_[i]); + StateId num_states = entry.NumStates(); + correction_factors.resize(num_states); + self_loop_pdf_classes.resize(num_states, -1); + for (StateId s = 0; s < num_states; s++) { + float tot_prob = exp(-entry.Final(s).Value()), + self_loop_prob = 0.0; + for (fst::ArcIterator aiter(entry, s); + !aiter.Done(); aiter.Next()) { + const Arc& arc = aiter.Value(); + float this_prob = exp(-arc.weight.Value()); + tot_prob += this_prob; + if (arc.nextstate == s) { + self_loop_prob += this_prob; + KALDI_ASSERT(self_loop_pdf_classes[s] == -1 && + "State in topology has more than one self-loop"); + self_loop_pdf_classes[s] = arc.ilabel; + } + } + KALDI_ASSERT(tot_prob > 0 && "Invalid topology"); + // correction_factor is initialized with a number <= 0 that will be added + // to costs. It will result in properly normalized probs after removing + // the self-loop, assuming the topo was properly normalized before.
+ correction_factors[s] = log((tot_prob - self_loop_prob) / tot_prob); + } + } +} + +} // End namespace kaldi diff --git a/src/hmm/topology.h b/src/hmm/topology.h new file mode 100644 index 00000000000..55ec4dcf35c --- /dev/null +++ b/src/hmm/topology.h @@ -0,0 +1,186 @@ +// hmm/topology.h + +// Copyright 2009-2011 Microsoft Corporation +// 2019 Johns Hopkins University (author: Daniel Povey) +// 2019 Daniel Galvez + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_HMM_HMM_TOPOLOGY_H_ +#define KALDI_HMM_HMM_TOPOLOGY_H_ + +#include +#include "base/kaldi-common.h" + + +namespace kaldi { + + +/// \addtogroup hmm_group +/// @{ + +/* + The following would be the text form for the "normal" 3-state HMM topology/ + "bakis model", with the typical reordering that we do to improve the + compactness of the compiled FSTs. The format is the OpenFst acceptor format. + The fields are, for transitions, + + and, for final-states, + + + The may be interpreted as negative log probabilities. + We normally set them so as to sum to one, in order to keep the fully + compiled (HCLG) graph fairly stochastic (meaning: sum-to-one, like an + HMM). + + The integers on the arcs, which we call 'pdf-classes', define which + arcs share the same "pdf" and which ones are distinct. + + Preconditions on topology: + - pdf-classes (3rd field on arcs) must + form a contiguous list of numbers starting from 1, although + different arcs with the same pdf-class are allowed. (We avoid 0 + because it is "special" in OpenFST, it is used for epsilon). + - The start state must be state 0 and there must be no + transitions entering it. + - The start state must not be final. + - No phone (in the ... block) may have the value 0. + + + + + 1 2 3 4 5 6 7 8 + 0 1 1 0.0 + 1 1 1 0.693 + 1 2 2 0.693 + 2 2 2 0.693 + 2 3 3 0.693 + 3 3 3 0.693 + 3 0.693 + + +*/ + + +/// A class for storing topology information for phones. See \ref hmm for context. +/// This object is sometimes accessed in a file by itself, but more often +/// as a class member of the Transition class (this is for convenience to reduce +/// the number of files programs have to access). + +class Topology { + public: + + void Read(std::istream &is, bool binary); + void Write(std::ostream &os, bool binary) const; + + // Checks that the object is valid, and throw exception otherwise. + void Check(); + + /// Returns the topology entry for this phone; + /// will throw exception if phone not covered by the topology. 
+ const fst::StdVectorFst &TopologyForPhone(int32 phone) const; + + /// Returns a reference to a vector of floats of size + /// `TopologyForPhone(phone).NumStates()`; this contains numbers <= 0 which are to be + /// added to the final-costs and non-self-loop arc costs when creating graphs + /// without self-loops (we call it a correction factor because in the + /// semiring it's multiplied, although physically it is added); this + /// correction factor will ensure that the probability sum of the + /// non-self-loop arcs and final-prob of each state has the same value that it + /// did before removing the self-loop. It's used to make sure that + /// intermediate FSTs made during graph compilation are as stochastic as + /// possible. + /// The user could compute this themselves, but we provide it + /// directly for speed. + const std::vector &CorrectionFactorsForPhone(int32 phone) const; + + /// For each phone, this will return a vector of size + /// `TopologyForPhone(phone).NumStates()` containing, for each state + /// in this phone's topology entry, the pdf-class of the self-loop on + /// that state (if any), and otherwise, -1. This could be computed + /// by the user from the FST, but is provided for convenience. + const std::vector &SelfLoopPdfClassesForPhone(int32 phone) const; + + /// Returns the number of \ref pdf_class "pdf-classes" for this phone; + /// throws exception if phone not covered by this topology. + int32 NumPdfClasses(int32 phone) const; + + /// Returns a reference to a sorted, unique list of phones covered by + /// the topology (these phones will be positive integers, and usually + /// contiguous and starting from one but the toolkit doesn't assume + /// they are contiguous). + const std::vector &GetPhones() const { return phones_; }; + + /// Outputs a vector of int32, indexed by phone, that gives the + /// number of \ref pdf_class pdf-classes for the phones; this is + /// used by tree-building code such as BuildTree(). + void GetPhoneToNumPdfClasses(std::vector *phone2num_pdf_classes) const; + + // Returns the minimum number of arcs/frames it takes to traverse this model + // for this phone: e.g. 3 for the normal HMM topology. + int32 MinLength(int32 phone) const; + + Topology() {} + + bool operator == (const Topology &other) const; + + // Allow default assignment operator and copy constructor. + private: + + void ComputeDerived(); + + using Arc = typename fst::StdVectorFst::Arc; + using StateId = typename fst::StdVectorFst::StateId; + using Weight = typename fst::StdVectorFst::Weight; + + std::vector phones_; // list of all phones we have topology for. + // Sorted, uniq. no epsilon (zero) phone. + std::vector phone2idx_; // map from phones to indexes into the entries + // vector (or -1 for not present). + std::vector entries_; // list of topology entries, indexed + // by the elements of phone2idx_. + + // Below this point are 'derived quantities' (things not written to disk, + // that can be worked out from the information above). + + // This is a vector indexed by 'idx' (the same as the index into entries_) and + // then by state-id in the corresponding topology entry; it contains the + // correction factor that we add to the costs of arcs leaving that state (and + // its final-cost) if we remove the self-loop; it's a number <= 0. This will + // make the probability sum of this state have the same value it did before + // removing the self-loop, hopefully 1.0. (viewing the costs as negated + // log-probs, of course).
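A small worked example of the correction factor described above; the function name is illustrative and the formula follows Topology::ComputeDerived() earlier in this patch.

#include <cassert>
#include <cmath>

// Correction added to the costs of a state's non-self-loop arcs and final-cost
// once its self-loop is removed; it is <= 0 for a normalized state.
double SelfLoopCorrection(double tot_prob, double self_loop_prob) {
  assert(tot_prob > 0.0 && self_loop_prob >= 0.0 && self_loop_prob < tot_prob);
  return std::log((tot_prob - self_loop_prob) / tot_prob);
}

// Example: tot_prob = 1.0, self_loop_prob = 0.5 gives log(0.5) ~= -0.693.
// Adding -0.693 to each remaining cost (cost == -log p) doubles the remaining
// probabilities, so the state's outgoing mass again sums to 1.0.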
Doing this will make the no-self-loop FST + // stochastic if it was stochastic with the self-loops. + std::vector > self_loop_correction_factors_; + + // This is a vector indexed by 'idx' (the same as the index into entries_) and + // then by state-id in the corresponding topology entry; it contains the + // pdf-class of the self-loop of each state that had a self-loop, or -1 + // for the states that didn't have self-loops. Note: the pdf-class is + // a number >0 which is the label on the arc in the topology entries (ilabel + // or olabel; they are the same because the topology entries are + // acceptors). + std::vector > self_loop_pdf_classes_; +}; + + +/// @} end "addtogroup hmm_group" + + +} // end namespace kaldi + + +#endif diff --git a/src/hmm/transition-model.cc b/src/hmm/transition-model.cc deleted file mode 100644 index 420a94585ea..00000000000 --- a/src/hmm/transition-model.cc +++ /dev/null @@ -1,924 +0,0 @@ -// hmm/transition-model.cc - -// Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -// Johns Hopkins University (author: Guoguo Chen) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include "hmm/transition-model.h" -#include "tree/context-dep.h" - -namespace kaldi { - -void TransitionModel::ComputeTuples(const ContextDependencyInterface &ctx_dep) { - if (IsHmm()) - ComputeTuplesIsHmm(ctx_dep); - else - ComputeTuplesNotHmm(ctx_dep); - - // now tuples_ is populated with all possible tuples of (phone, hmm_state, pdf, self_loop_pdf). - std::sort(tuples_.begin(), tuples_.end()); // sort to enable reverse lookup. - // this sorting defines the transition-ids. -} - -void TransitionModel::ComputeTuplesIsHmm(const ContextDependencyInterface &ctx_dep) { - const std::vector &phones = topo_.GetPhones(); - KALDI_ASSERT(!phones.empty()); - - // this is the case for normal models. but not for chain models - std::vector > > pdf_info; - std::vector num_pdf_classes( 1 + *std::max_element(phones.begin(), phones.end()), -1); - for (size_t i = 0; i < phones.size(); i++) - num_pdf_classes[phones[i]] = topo_.NumPdfClasses(phones[i]); - ctx_dep.GetPdfInfo(phones, num_pdf_classes, &pdf_info); - // pdf_info is list indexed by pdf of which (phone, pdf_class) it - // can correspond to. - - std::map, std::vector > to_hmm_state_list; - // to_hmm_state_list is a map from (phone, pdf_class) to the list - // of hmm-states in the HMM for that phone that that (phone, pdf-class) - // can correspond to. - for (size_t i = 0; i < phones.size(); i++) { // setting up to_hmm_state_list. - int32 phone = phones[i]; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); - for (int32 j = 0; j < static_cast(entry.size()); j++) { // for each state... 
- int32 pdf_class = entry[j].forward_pdf_class; - if (pdf_class != kNoPdf) { - to_hmm_state_list[std::make_pair(phone, pdf_class)].push_back(j); - } - } - } - - for (int32 pdf = 0; pdf < static_cast(pdf_info.size()); pdf++) { - for (size_t j = 0; j < pdf_info[pdf].size(); j++) { - int32 phone = pdf_info[pdf][j].first, - pdf_class = pdf_info[pdf][j].second; - const std::vector &state_vec = to_hmm_state_list[std::make_pair(phone, pdf_class)]; - KALDI_ASSERT(!state_vec.empty()); - // state_vec is a list of the possible HMM-states that emit this - // pdf_class. - for (size_t k = 0; k < state_vec.size(); k++) { - int32 hmm_state = state_vec[k]; - tuples_.push_back(Tuple(phone, hmm_state, pdf, pdf)); - } - } - } -} - -void TransitionModel::ComputeTuplesNotHmm(const ContextDependencyInterface &ctx_dep) { - const std::vector &phones = topo_.GetPhones(); - KALDI_ASSERT(!phones.empty()); - - // pdf_info is a set of lists indexed by phone. Each list is indexed by - // (pdf-class, self-loop pdf-class) of each state of that phone, and the element - // is a list of possible (pdf, self-loop pdf) pairs that (pdf-class, self-loop pdf-class) - // pair generates. - std::vector > > > pdf_info; - // pdf_class_pairs is a set of lists indexed by phone. Each list stores - // (pdf-class, self-loop pdf-class) of each state of that phone. - std::vector > > pdf_class_pairs; - pdf_class_pairs.resize(1 + *std::max_element(phones.begin(), phones.end())); - for (size_t i = 0; i < phones.size(); i++) { - int32 phone = phones[i]; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); - for (int32 j = 0; j < static_cast(entry.size()); j++) { // for each state... - int32 forward_pdf_class = entry[j].forward_pdf_class, self_loop_pdf_class = entry[j].self_loop_pdf_class; - if (forward_pdf_class != kNoPdf) - pdf_class_pairs[phone].push_back(std::make_pair(forward_pdf_class, self_loop_pdf_class)); - } - } - ctx_dep.GetPdfInfo(phones, pdf_class_pairs, &pdf_info); - - std::vector, std::vector > > to_hmm_state_list; - to_hmm_state_list.resize(1 + *std::max_element(phones.begin(), phones.end())); - // to_hmm_state_list is a phone-indexed set of maps from (pdf-class, self-loop pdf_class) to the list - // of hmm-states in the HMM for that phone that that (pdf-class, self-loop pdf-class) - // can correspond to. - for (size_t i = 0; i < phones.size(); i++) { // setting up to_hmm_state_list. - int32 phone = phones[i]; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); - std::map, std::vector > phone_to_hmm_state_list; - for (int32 j = 0; j < static_cast(entry.size()); j++) { // for each state... 
- int32 forward_pdf_class = entry[j].forward_pdf_class, self_loop_pdf_class = entry[j].self_loop_pdf_class; - if (forward_pdf_class != kNoPdf) { - phone_to_hmm_state_list[std::make_pair(forward_pdf_class, self_loop_pdf_class)].push_back(j); - } - } - to_hmm_state_list[phone] = phone_to_hmm_state_list; - } - - for (int32 i = 0; i < phones.size(); i++) { - int32 phone = phones[i]; - for (int32 j = 0; j < static_cast(pdf_info[phone].size()); j++) { - int32 pdf_class = pdf_class_pairs[phone][j].first, - self_loop_pdf_class = pdf_class_pairs[phone][j].second; - const std::vector &state_vec = - to_hmm_state_list[phone][std::make_pair(pdf_class, self_loop_pdf_class)]; - KALDI_ASSERT(!state_vec.empty()); - for (size_t k = 0; k < state_vec.size(); k++) { - int32 hmm_state = state_vec[k]; - for (size_t m = 0; m < pdf_info[phone][j].size(); m++) { - int32 pdf = pdf_info[phone][j][m].first, - self_loop_pdf = pdf_info[phone][j][m].second; - tuples_.push_back(Tuple(phone, hmm_state, pdf, self_loop_pdf)); - } - } - } - } -} - -void TransitionModel::ComputeDerived() { - state2id_.resize(tuples_.size()+2); // indexed by transition-state, which - // is one based, but also an entry for one past end of list. - - int32 cur_transition_id = 1; - num_pdfs_ = 0; - for (int32 tstate = 1; - tstate <= static_cast(tuples_.size()+1); // not a typo. - tstate++) { - state2id_[tstate] = cur_transition_id; - if (static_cast(tstate) <= tuples_.size()) { - int32 phone = tuples_[tstate-1].phone, - hmm_state = tuples_[tstate-1].hmm_state, - forward_pdf = tuples_[tstate-1].forward_pdf, - self_loop_pdf = tuples_[tstate-1].self_loop_pdf; - num_pdfs_ = std::max(num_pdfs_, 1 + forward_pdf); - num_pdfs_ = std::max(num_pdfs_, 1 + self_loop_pdf); - const HmmTopology::HmmState &state = topo_.TopologyForPhone(phone)[hmm_state]; - int32 my_num_ids = static_cast(state.transitions.size()); - cur_transition_id += my_num_ids; // # trans out of this state. - } - } - - id2state_.resize(cur_transition_id); // cur_transition_id is #transition-ids+1. - id2pdf_id_.resize(cur_transition_id); - for (int32 tstate = 1; tstate <= static_cast(tuples_.size()); tstate++) { - for (int32 tid = state2id_[tstate]; tid < state2id_[tstate+1]; tid++) { - id2state_[tid] = tstate; - if (IsSelfLoop(tid)) - id2pdf_id_[tid] = tuples_[tstate-1].self_loop_pdf; - else - id2pdf_id_[tid] = tuples_[tstate-1].forward_pdf; - } - } - - // The following statements put copies a large number in the region of memory - // past the end of the id2pdf_id_ array, while leaving the array as it was - // before. The goal of this is to speed up decoding by disabling a check - // inside TransitionIdToPdf() that the transition-id was within the correct - // range. - int32 num_big_numbers = std::min(2000, cur_transition_id); - id2pdf_id_.resize(cur_transition_id + num_big_numbers, - std::numeric_limits::max()); - id2pdf_id_.resize(cur_transition_id); -} - -void TransitionModel::InitializeProbs() { - log_probs_.Resize(NumTransitionIds()+1); // one-based array, zeroth element empty. 
- for (int32 trans_id = 1; trans_id <= NumTransitionIds(); trans_id++) { - int32 trans_state = id2state_[trans_id]; - int32 trans_index = trans_id - state2id_[trans_state]; - const Tuple &tuple = tuples_[trans_state-1]; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone); - KALDI_ASSERT(static_cast(tuple.hmm_state) < entry.size()); - BaseFloat prob = entry[tuple.hmm_state].transitions[trans_index].second; - if (prob <= 0.0) - KALDI_ERR << "TransitionModel::InitializeProbs, zero " - "probability [should remove that entry in the topology]"; - if (prob > 1.0) - KALDI_WARN << "TransitionModel::InitializeProbs, prob greater than one."; - log_probs_(trans_id) = Log(prob); - } - ComputeDerivedOfProbs(); -} - -void TransitionModel::Check() const { - KALDI_ASSERT(NumTransitionIds() != 0 && NumTransitionStates() != 0); - { - int32 sum = 0; - for (int32 ts = 1; ts <= NumTransitionStates(); ts++) sum += NumTransitionIndices(ts); - KALDI_ASSERT(sum == NumTransitionIds()); - } - for (int32 tid = 1; tid <= NumTransitionIds(); tid++) { - int32 tstate = TransitionIdToTransitionState(tid), - index = TransitionIdToTransitionIndex(tid); - KALDI_ASSERT(tstate > 0 && tstate <=NumTransitionStates() && index >= 0); - KALDI_ASSERT(tid == PairToTransitionId(tstate, index)); - int32 phone = TransitionStateToPhone(tstate), - hmm_state = TransitionStateToHmmState(tstate), - forward_pdf = TransitionStateToForwardPdf(tstate), - self_loop_pdf = TransitionStateToSelfLoopPdf(tstate); - KALDI_ASSERT(tstate == TupleToTransitionState(phone, hmm_state, forward_pdf, self_loop_pdf)); - KALDI_ASSERT(log_probs_(tid) <= 0.0 && log_probs_(tid) - log_probs_(tid) == 0.0); - // checking finite and non-positive (and not out-of-bounds). - } -} - -bool TransitionModel::IsHmm() const { - const std::vector &phones = topo_.GetPhones(); - KALDI_ASSERT(!phones.empty()); - for (size_t i = 0; i < phones.size(); i++) { - int32 phone = phones[i]; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); - for (int32 j = 0; j < static_cast(entry.size()); j++) { // for each state... - if (entry[j].forward_pdf_class != entry[j].self_loop_pdf_class) - return false; - } - } - return true; -} - -TransitionModel::TransitionModel(const ContextDependencyInterface &ctx_dep, - const HmmTopology &hmm_topo): topo_(hmm_topo) { - // First thing is to get all possible tuples. - ComputeTuples(ctx_dep); - ComputeDerived(); - InitializeProbs(); - Check(); -} - -int32 TransitionModel::TupleToTransitionState(int32 phone, int32 hmm_state, int32 pdf, int32 self_loop_pdf) const { - Tuple tuple(phone, hmm_state, pdf, self_loop_pdf); - // Note: if this ever gets too expensive, which is unlikely, we can refactor - // this code to sort first on pdf, and then index on pdf, so those - // that have the same pdf are in a contiguous range. - std::vector::const_iterator iter = - std::lower_bound(tuples_.begin(), tuples_.end(), tuple); - if (iter == tuples_.end() || !(*iter == tuple)) { - KALDI_ERR << "TransitionModel::TupleToTransitionState, tuple not found." - << " (incompatible tree and model?)"; - } - // tuples_ is indexed by transition_state-1, so add one. 
- return static_cast((iter - tuples_.begin())) + 1; -} - - -int32 TransitionModel::NumTransitionIndices(int32 trans_state) const { - KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); - return static_cast(state2id_[trans_state+1]-state2id_[trans_state]); -} - -int32 TransitionModel::TransitionIdToTransitionState(int32 trans_id) const { - KALDI_ASSERT(trans_id != 0 && static_cast(trans_id) < id2state_.size()); - return id2state_[trans_id]; -} - -int32 TransitionModel::TransitionIdToTransitionIndex(int32 trans_id) const { - KALDI_ASSERT(trans_id != 0 && static_cast(trans_id) < id2state_.size()); - return trans_id - state2id_[id2state_[trans_id]]; -} - -int32 TransitionModel::TransitionStateToPhone(int32 trans_state) const { - KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); - return tuples_[trans_state-1].phone; -} - -int32 TransitionModel::TransitionStateToForwardPdf(int32 trans_state) const { - KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); - return tuples_[trans_state-1].forward_pdf; -} - -int32 TransitionModel::TransitionStateToForwardPdfClass( - int32 trans_state) const { - KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); - const Tuple &t = tuples_[trans_state-1]; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone); - KALDI_ASSERT(static_cast(t.hmm_state) < entry.size()); - return entry[t.hmm_state].forward_pdf_class; -} - - -int32 TransitionModel::TransitionStateToSelfLoopPdfClass( - int32 trans_state) const { - KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); - const Tuple &t = tuples_[trans_state-1]; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone); - KALDI_ASSERT(static_cast(t.hmm_state) < entry.size()); - return entry[t.hmm_state].self_loop_pdf_class; -} - - -int32 TransitionModel::TransitionStateToSelfLoopPdf(int32 trans_state) const { - KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); - return tuples_[trans_state-1].self_loop_pdf; -} - -int32 TransitionModel::TransitionStateToHmmState(int32 trans_state) const { - KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); - return tuples_[trans_state-1].hmm_state; -} - -int32 TransitionModel::PairToTransitionId(int32 trans_state, int32 trans_index) const { - KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); - KALDI_ASSERT(trans_index < state2id_[trans_state+1] - state2id_[trans_state]); - return state2id_[trans_state] + trans_index; -} - -int32 TransitionModel::NumPhones() const { - int32 num_trans_state = tuples_.size(); - int32 max_phone_id = 0; - for (int32 i = 0; i < num_trans_state; ++i) { - if (tuples_[i].phone > max_phone_id) - max_phone_id = tuples_[i].phone; - } - return max_phone_id; -} - - -bool TransitionModel::IsFinal(int32 trans_id) const { - KALDI_ASSERT(static_cast(trans_id) < id2state_.size()); - int32 trans_state = id2state_[trans_id]; - int32 trans_index = trans_id - state2id_[trans_state]; - const Tuple &tuple = tuples_[trans_state-1]; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone); - KALDI_ASSERT(static_cast(tuple.hmm_state) < entry.size()); - KALDI_ASSERT(static_cast(tuple.hmm_state) < entry.size()); - KALDI_ASSERT(static_cast(trans_index) < - entry[tuple.hmm_state].transitions.size()); - // return true if the transition goes to the final state of the - // topology entry. 
- return (entry[tuple.hmm_state].transitions[trans_index].first + 1 == - static_cast(entry.size())); -} - - - -int32 TransitionModel::SelfLoopOf(int32 trans_state) const { // returns the self-loop transition-id, - KALDI_ASSERT(static_cast(trans_state-1) < tuples_.size()); - const Tuple &tuple = tuples_[trans_state-1]; - // or zero if does not exist. - int32 phone = tuple.phone, hmm_state = tuple.hmm_state; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); - KALDI_ASSERT(static_cast(hmm_state) < entry.size()); - for (int32 trans_index = 0; - trans_index < static_cast(entry[hmm_state].transitions.size()); - trans_index++) - if (entry[hmm_state].transitions[trans_index].first == hmm_state) - return PairToTransitionId(trans_state, trans_index); - return 0; // invalid transition id. -} - -void TransitionModel::ComputeDerivedOfProbs() { - non_self_loop_log_probs_.Resize(NumTransitionStates()+1); // this array indexed - // by transition-state with nothing in zeroth element. - for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) { - int32 tid = SelfLoopOf(tstate); - if (tid == 0) { // no self-loop - non_self_loop_log_probs_(tstate) = 0.0; // log(1.0) - } else { - BaseFloat self_loop_prob = Exp(GetTransitionLogProb(tid)), - non_self_loop_prob = 1.0 - self_loop_prob; - if (non_self_loop_prob <= 0.0) { - KALDI_WARN << "ComputeDerivedOfProbs(): non-self-loop prob is " << non_self_loop_prob; - non_self_loop_prob = 1.0e-10; // just so we can continue... - } - non_self_loop_log_probs_(tstate) = Log(non_self_loop_prob); // will be negative. - } - } -} - -void TransitionModel::Read(std::istream &is, bool binary) { - ExpectToken(is, binary, ""); - topo_.Read(is, binary); - std::string token; - ReadToken(is, binary, &token); - int32 size; - ReadBasicType(is, binary, &size); - tuples_.resize(size); - for (int32 i = 0; i < size; i++) { - ReadBasicType(is, binary, &(tuples_[i].phone)); - ReadBasicType(is, binary, &(tuples_[i].hmm_state)); - ReadBasicType(is, binary, &(tuples_[i].forward_pdf)); - if (token == "") - ReadBasicType(is, binary, &(tuples_[i].self_loop_pdf)); - else if (token == "") - tuples_[i].self_loop_pdf = tuples_[i].forward_pdf; - } - ReadToken(is, binary, &token); - KALDI_ASSERT(token == "" || token == ""); - ComputeDerived(); - ExpectToken(is, binary, ""); - log_probs_.Read(is, binary); - ExpectToken(is, binary, ""); - ExpectToken(is, binary, ""); - ComputeDerivedOfProbs(); - Check(); -} - -void TransitionModel::Write(std::ostream &os, bool binary) const { - bool is_hmm = IsHmm(); - WriteToken(os, binary, ""); - if (!binary) os << "\n"; - topo_.Write(os, binary); - if (is_hmm) - WriteToken(os, binary, ""); - else - WriteToken(os, binary, ""); - WriteBasicType(os, binary, static_cast(tuples_.size())); - if (!binary) os << "\n"; - for (int32 i = 0; i < static_cast (tuples_.size()); i++) { - WriteBasicType(os, binary, tuples_[i].phone); - WriteBasicType(os, binary, tuples_[i].hmm_state); - WriteBasicType(os, binary, tuples_[i].forward_pdf); - if (!is_hmm) - WriteBasicType(os, binary, tuples_[i].self_loop_pdf); - if (!binary) os << "\n"; - } - if (is_hmm) - WriteToken(os, binary, ""); - else - WriteToken(os, binary, ""); - if (!binary) os << "\n"; - WriteToken(os, binary, ""); - if (!binary) os << "\n"; - log_probs_.Write(os, binary); - WriteToken(os, binary, ""); - if (!binary) os << "\n"; - WriteToken(os, binary, ""); - if (!binary) os << "\n"; -} - -BaseFloat TransitionModel::GetTransitionProb(int32 trans_id) const { - return Exp(log_probs_(trans_id)); -} - 
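To make the integer mappings concrete: the integer-mapping and probability-lookup accessors of the old TransitionModel (defined in this file and its header) compose as in the following standalone sketch. It is not part of the patch; the helper name DescribeTransitionId is hypothetical, and it assumes a TransitionModel that has already been read from disk.

#include "hmm/transition-model.h"

namespace kaldi {
// Unpacks one transition-id using only the mappings of the old interface:
// transition-id -> transition-state -> (phone, HMM-state), plus the
// pdf-id and log-prob lookups.
void DescribeTransitionId(const TransitionModel &trans_model, int32 trans_id) {
  int32 trans_state = trans_model.TransitionIdToTransitionState(trans_id);
  int32 phone = trans_model.TransitionStateToPhone(trans_state);
  int32 hmm_state = trans_model.TransitionStateToHmmState(trans_state);
  int32 pdf_id = trans_model.TransitionIdToPdf(trans_id);
  BaseFloat log_prob = trans_model.GetTransitionLogProb(trans_id);
  int32 self_loop_tid = trans_model.SelfLoopOf(trans_state);  // 0 if no self-loop.
  KALDI_LOG << "transition-id " << trans_id << ": phone " << phone
            << ", hmm-state " << hmm_state << ", pdf-id " << pdf_id
            << ", log-prob " << log_prob
            << (trans_id == self_loop_tid ? " [self-loop]" : "");
}
}  // namespace kaldi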
-BaseFloat TransitionModel::GetTransitionLogProb(int32 trans_id) const { - return log_probs_(trans_id); -} - -BaseFloat TransitionModel::GetNonSelfLoopLogProb(int32 trans_state) const { - KALDI_ASSERT(trans_state != 0); - return non_self_loop_log_probs_(trans_state); -} - -BaseFloat TransitionModel::GetTransitionLogProbIgnoringSelfLoops(int32 trans_id) const { - KALDI_ASSERT(trans_id != 0); - KALDI_PARANOID_ASSERT(!IsSelfLoop(trans_id)); - return log_probs_(trans_id) - GetNonSelfLoopLogProb(TransitionIdToTransitionState(trans_id)); -} - -// stats are counts/weights, indexed by transition-id. -void TransitionModel::MleUpdate(const Vector &stats, - const MleTransitionUpdateConfig &cfg, - BaseFloat *objf_impr_out, - BaseFloat *count_out) { - if (cfg.share_for_pdfs) { - MleUpdateShared(stats, cfg, objf_impr_out, count_out); - return; - } - BaseFloat count_sum = 0.0, objf_impr_sum = 0.0; - int32 num_skipped = 0, num_floored = 0; - KALDI_ASSERT(stats.Dim() == NumTransitionIds()+1); - for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) { - int32 n = NumTransitionIndices(tstate); - KALDI_ASSERT(n>=1); - if (n > 1) { // no point updating if only one transition... - Vector counts(n); - for (int32 tidx = 0; tidx < n; tidx++) { - int32 tid = PairToTransitionId(tstate, tidx); - counts(tidx) = stats(tid); - } - double tstate_tot = counts.Sum(); - count_sum += tstate_tot; - if (tstate_tot < cfg.mincount) { num_skipped++; } - else { - Vector old_probs(n), new_probs(n); - for (int32 tidx = 0; tidx < n; tidx++) { - int32 tid = PairToTransitionId(tstate, tidx); - old_probs(tidx) = new_probs(tidx) = GetTransitionProb(tid); - } - for (int32 tidx = 0; tidx < n; tidx++) - new_probs(tidx) = counts(tidx) / tstate_tot; - for (int32 i = 0; i < 3; i++) { // keep flooring+renormalizing for 3 times.. - new_probs.Scale(1.0 / new_probs.Sum()); - for (int32 tidx = 0; tidx < n; tidx++) - new_probs(tidx) = std::max(new_probs(tidx), cfg.floor); - } - // Compute objf change - for (int32 tidx = 0; tidx < n; tidx++) { - if (new_probs(tidx) == cfg.floor) num_floored++; - double objf_change = counts(tidx) * (Log(new_probs(tidx)) - - Log(old_probs(tidx))); - objf_impr_sum += objf_change; - } - // Commit updated values. - for (int32 tidx = 0; tidx < n; tidx++) { - int32 tid = PairToTransitionId(tstate, tidx); - log_probs_(tid) = Log(new_probs(tidx)); - if (log_probs_(tid) - log_probs_(tid) != 0.0) - KALDI_ERR << "Log probs is inf or NaN: error in update or bad stats?"; - } - } - } - } - KALDI_LOG << "TransitionModel::Update, objf change is " - << (objf_impr_sum / count_sum) << " per frame over " << count_sum - << " frames. "; - KALDI_LOG << num_floored << " probabilities floored, " << num_skipped - << " out of " << NumTransitionStates() << " transition-states " - "skipped due to insuffient data (it is normal to have some skipped.)"; - if (objf_impr_out) *objf_impr_out = objf_impr_sum; - if (count_out) *count_out = count_sum; - ComputeDerivedOfProbs(); -} - - -// stats are counts/weights, indexed by transition-id. 
-void TransitionModel::MapUpdate(const Vector &stats, - const MapTransitionUpdateConfig &cfg, - BaseFloat *objf_impr_out, - BaseFloat *count_out) { - KALDI_ASSERT(cfg.tau > 0.0); - if (cfg.share_for_pdfs) { - MapUpdateShared(stats, cfg, objf_impr_out, count_out); - return; - } - BaseFloat count_sum = 0.0, objf_impr_sum = 0.0; - KALDI_ASSERT(stats.Dim() == NumTransitionIds()+1); - for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) { - int32 n = NumTransitionIndices(tstate); - KALDI_ASSERT(n>=1); - if (n > 1) { // no point updating if only one transition... - Vector counts(n); - for (int32 tidx = 0; tidx < n; tidx++) { - int32 tid = PairToTransitionId(tstate, tidx); - counts(tidx) = stats(tid); - } - double tstate_tot = counts.Sum(); - count_sum += tstate_tot; - Vector old_probs(n), new_probs(n); - for (int32 tidx = 0; tidx < n; tidx++) { - int32 tid = PairToTransitionId(tstate, tidx); - old_probs(tidx) = new_probs(tidx) = GetTransitionProb(tid); - } - for (int32 tidx = 0; tidx < n; tidx++) - new_probs(tidx) = (counts(tidx) + cfg.tau * old_probs(tidx)) / - (cfg.tau + tstate_tot); - // Compute objf change - for (int32 tidx = 0; tidx < n; tidx++) { - double objf_change = counts(tidx) * (Log(new_probs(tidx)) - - Log(old_probs(tidx))); - objf_impr_sum += objf_change; - } - // Commit updated values. - for (int32 tidx = 0; tidx < n; tidx++) { - int32 tid = PairToTransitionId(tstate, tidx); - log_probs_(tid) = Log(new_probs(tidx)); - if (log_probs_(tid) - log_probs_(tid) != 0.0) - KALDI_ERR << "Log probs is inf or NaN: error in update or bad stats?"; - } - } - } - KALDI_LOG << "Objf change is " << (objf_impr_sum / count_sum) - << " per frame over " << count_sum - << " frames."; - if (objf_impr_out) *objf_impr_out = objf_impr_sum; - if (count_out) *count_out = count_sum; - ComputeDerivedOfProbs(); -} - - - -/// This version of the Update() function is for if the user specifies -/// --share-for-pdfs=true. We share the transitions for all states that -/// share the same pdf. -void TransitionModel::MleUpdateShared(const Vector &stats, - const MleTransitionUpdateConfig &cfg, - BaseFloat *objf_impr_out, - BaseFloat *count_out) { - KALDI_ASSERT(cfg.share_for_pdfs); - - BaseFloat count_sum = 0.0, objf_impr_sum = 0.0; - int32 num_skipped = 0, num_floored = 0; - KALDI_ASSERT(stats.Dim() == NumTransitionIds()+1); - std::map > pdf_to_tstate; - - for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) { - int32 pdf = TransitionStateToForwardPdf(tstate); - pdf_to_tstate[pdf].insert(tstate); - if (!IsHmm()) { - pdf = TransitionStateToSelfLoopPdf(tstate); - pdf_to_tstate[pdf].insert(tstate); - } - } - std::map >::iterator map_iter; - for (map_iter = pdf_to_tstate.begin(); - map_iter != pdf_to_tstate.end(); - ++map_iter) { - // map_iter->first is pdf-id... not needed. - const std::set &tstates = map_iter->second; - KALDI_ASSERT(!tstates.empty()); - int32 one_tstate = *(tstates.begin()); - int32 n = NumTransitionIndices(one_tstate); - KALDI_ASSERT(n >= 1); - if (n > 1) { // Only update if >1 transition... 
- Vector counts(n); - for (std::set::const_iterator iter = tstates.begin(); - iter != tstates.end(); - ++iter) { - int32 tstate = *iter; - if (NumTransitionIndices(tstate) != n) - KALDI_ERR << "Mismatch in #transition indices: you cannot " - "use the --share-for-pdfs option with this topology " - "and sharing scheme."; - for (int32 tidx = 0; tidx < n; tidx++) { - int32 tid = PairToTransitionId(tstate, tidx); - counts(tidx) += stats(tid); - } - } - double pdf_tot = counts.Sum(); - count_sum += pdf_tot; - if (pdf_tot < cfg.mincount) { num_skipped++; } - else { - // Note: when calculating objf improvement, we - // assume we previously had the same tying scheme so - // we can get the params from one_tstate and they're valid - // for all. - Vector old_probs(n), new_probs(n); - for (int32 tidx = 0; tidx < n; tidx++) { - int32 tid = PairToTransitionId(one_tstate, tidx); - old_probs(tidx) = new_probs(tidx) = GetTransitionProb(tid); - } - for (int32 tidx = 0; tidx < n; tidx++) - new_probs(tidx) = counts(tidx) / pdf_tot; - for (int32 i = 0; i < 3; i++) { // keep flooring+renormalizing for 3 times.. - new_probs.Scale(1.0 / new_probs.Sum()); - for (int32 tidx = 0; tidx < n; tidx++) - new_probs(tidx) = std::max(new_probs(tidx), cfg.floor); - } - // Compute objf change - for (int32 tidx = 0; tidx < n; tidx++) { - if (new_probs(tidx) == cfg.floor) num_floored++; - double objf_change = counts(tidx) * (Log(new_probs(tidx)) - - Log(old_probs(tidx))); - objf_impr_sum += objf_change; - } - // Commit updated values. - for (std::set::const_iterator iter = tstates.begin(); - iter != tstates.end(); - ++iter) { - int32 tstate = *iter; - for (int32 tidx = 0; tidx < n; tidx++) { - int32 tid = PairToTransitionId(tstate, tidx); - log_probs_(tid) = Log(new_probs(tidx)); - if (log_probs_(tid) - log_probs_(tid) != 0.0) - KALDI_ERR << "Log probs is inf or NaN: error in update or bad stats?"; - } - } - } - } - } - KALDI_LOG << "Objf change is " << (objf_impr_sum / count_sum) - << " per frame over " << count_sum << " frames; " - << num_floored << " probabilities floored, " - << num_skipped << " pdf-ids skipped due to insuffient data."; - if (objf_impr_out) *objf_impr_out = objf_impr_sum; - if (count_out) *count_out = count_sum; - ComputeDerivedOfProbs(); -} - - -/// This version of the MapUpdate() function is for if the user specifies -/// --share-for-pdfs=true. We share the transitions for all states that -/// share the same pdf. -void TransitionModel::MapUpdateShared(const Vector &stats, - const MapTransitionUpdateConfig &cfg, - BaseFloat *objf_impr_out, - BaseFloat *count_out) { - KALDI_ASSERT(cfg.share_for_pdfs); - - BaseFloat count_sum = 0.0, objf_impr_sum = 0.0; - KALDI_ASSERT(stats.Dim() == NumTransitionIds()+1); - std::map > pdf_to_tstate; - - for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) { - int32 pdf = TransitionStateToForwardPdf(tstate); - pdf_to_tstate[pdf].insert(tstate); - if (!IsHmm()) { - pdf = TransitionStateToSelfLoopPdf(tstate); - pdf_to_tstate[pdf].insert(tstate); - } - } - std::map >::iterator map_iter; - for (map_iter = pdf_to_tstate.begin(); - map_iter != pdf_to_tstate.end(); - ++map_iter) { - // map_iter->first is pdf-id... not needed. - const std::set &tstates = map_iter->second; - KALDI_ASSERT(!tstates.empty()); - int32 one_tstate = *(tstates.begin()); - int32 n = NumTransitionIndices(one_tstate); - KALDI_ASSERT(n >= 1); - if (n > 1) { // Only update if >1 transition... 
- Vector counts(n); - for (std::set::const_iterator iter = tstates.begin(); - iter != tstates.end(); - ++iter) { - int32 tstate = *iter; - if (NumTransitionIndices(tstate) != n) - KALDI_ERR << "Mismatch in #transition indices: you cannot " - "use the --share-for-pdfs option with this topology " - "and sharing scheme."; - for (int32 tidx = 0; tidx < n; tidx++) { - int32 tid = PairToTransitionId(tstate, tidx); - counts(tidx) += stats(tid); - } - } - double pdf_tot = counts.Sum(); - count_sum += pdf_tot; - - // Note: when calculating objf improvement, we - // assume we previously had the same tying scheme so - // we can get the params from one_tstate and they're valid - // for all. - Vector old_probs(n), new_probs(n); - for (int32 tidx = 0; tidx < n; tidx++) { - int32 tid = PairToTransitionId(one_tstate, tidx); - old_probs(tidx) = new_probs(tidx) = GetTransitionProb(tid); - } - for (int32 tidx = 0; tidx < n; tidx++) - new_probs(tidx) = (counts(tidx) + old_probs(tidx) * cfg.tau) / - (pdf_tot + cfg.tau); - // Compute objf change - for (int32 tidx = 0; tidx < n; tidx++) { - double objf_change = counts(tidx) * (Log(new_probs(tidx)) - - Log(old_probs(tidx))); - objf_impr_sum += objf_change; - } - // Commit updated values. - for (std::set::const_iterator iter = tstates.begin(); - iter != tstates.end(); - ++iter) { - int32 tstate = *iter; - for (int32 tidx = 0; tidx < n; tidx++) { - int32 tid = PairToTransitionId(tstate, tidx); - log_probs_(tid) = Log(new_probs(tidx)); - if (log_probs_(tid) - log_probs_(tid) != 0.0) - KALDI_ERR << "Log probs is inf or NaN: error in update or bad stats?"; - } - } - } - } - KALDI_LOG << "Objf change is " << (objf_impr_sum / count_sum) - << " per frame over " << count_sum - << " frames."; - if (objf_impr_out) *objf_impr_out = objf_impr_sum; - if (count_out) *count_out = count_sum; - ComputeDerivedOfProbs(); -} - - -int32 TransitionModel::TransitionIdToPhone(int32 trans_id) const { - KALDI_ASSERT(trans_id != 0 && static_cast(trans_id) < id2state_.size()); - int32 trans_state = id2state_[trans_id]; - return tuples_[trans_state-1].phone; -} - -int32 TransitionModel::TransitionIdToPdfClass(int32 trans_id) const { - KALDI_ASSERT(trans_id != 0 && static_cast(trans_id) < id2state_.size()); - int32 trans_state = id2state_[trans_id]; - - const Tuple &t = tuples_[trans_state-1]; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone); - KALDI_ASSERT(static_cast(t.hmm_state) < entry.size()); - if (IsSelfLoop(trans_id)) - return entry[t.hmm_state].self_loop_pdf_class; - else - return entry[t.hmm_state].forward_pdf_class; -} - - -int32 TransitionModel::TransitionIdToHmmState(int32 trans_id) const { - KALDI_ASSERT(trans_id != 0 && static_cast(trans_id) < id2state_.size()); - int32 trans_state = id2state_[trans_id]; - const Tuple &t = tuples_[trans_state-1]; - return t.hmm_state; -} - -void TransitionModel::Print(std::ostream &os, - const std::vector &phone_names, - const Vector *occs) { - if (occs != NULL) - KALDI_ASSERT(occs->Dim() == NumPdfs()); - bool is_hmm = IsHmm(); - for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) { - const Tuple &tuple = tuples_[tstate-1]; - KALDI_ASSERT(static_cast(tuple.phone) < phone_names.size()); - std::string phone_name = phone_names[tuple.phone]; - - os << "Transition-state " << tstate << ": phone = " << phone_name - << " hmm-state = " << tuple.hmm_state; - if (is_hmm) - os << " pdf = " << tuple.forward_pdf << '\n'; - else - os << " forward-pdf = " << tuple.forward_pdf << " self-loop-pdf = " - << 
tuple.self_loop_pdf << '\n'; - for (int32 tidx = 0; tidx < NumTransitionIndices(tstate); tidx++) { - int32 tid = PairToTransitionId(tstate, tidx); - BaseFloat p = GetTransitionProb(tid); - os << " Transition-id = " << tid << " p = " << p; - if (occs != NULL) { - if (IsSelfLoop(tid)) - os << " count of pdf = " << (*occs)(tuple.self_loop_pdf); - else - os << " count of pdf = " << (*occs)(tuple.forward_pdf); - } - // now describe what it's a transition to. - if (IsSelfLoop(tid)) os << " [self-loop]\n"; - else { - int32 hmm_state = tuple.hmm_state; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone); - KALDI_ASSERT(static_cast(hmm_state) < entry.size()); - int32 next_hmm_state = entry[hmm_state].transitions[tidx].first; - KALDI_ASSERT(next_hmm_state != hmm_state); - os << " [" << hmm_state << " -> " << next_hmm_state << "]\n"; - } - } - } -} - -bool GetPdfsForPhones(const TransitionModel &trans_model, - const std::vector &phones, - std::vector *pdfs) { - KALDI_ASSERT(IsSortedAndUniq(phones)); - KALDI_ASSERT(pdfs != NULL); - pdfs->clear(); - for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++) { - if (std::binary_search(phones.begin(), phones.end(), - trans_model.TransitionStateToPhone(tstate))) { - pdfs->push_back(trans_model.TransitionStateToForwardPdf(tstate)); - pdfs->push_back(trans_model.TransitionStateToSelfLoopPdf(tstate)); - } - } - SortAndUniq(pdfs); - - for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++) - if ((std::binary_search(pdfs->begin(), pdfs->end(), - trans_model.TransitionStateToForwardPdf(tstate)) || - std::binary_search(pdfs->begin(), pdfs->end(), - trans_model.TransitionStateToSelfLoopPdf(tstate))) - && !std::binary_search(phones.begin(), phones.end(), - trans_model.TransitionStateToPhone(tstate))) - return false; - return true; -} - -bool GetPhonesForPdfs(const TransitionModel &trans_model, - const std::vector &pdfs, - std::vector *phones) { - KALDI_ASSERT(IsSortedAndUniq(pdfs)); - KALDI_ASSERT(phones != NULL); - phones->clear(); - for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++) { - if (std::binary_search(pdfs.begin(), pdfs.end(), - trans_model.TransitionStateToForwardPdf(tstate)) || - std::binary_search(pdfs.begin(), pdfs.end(), - trans_model.TransitionStateToSelfLoopPdf(tstate))) - phones->push_back(trans_model.TransitionStateToPhone(tstate)); - } - SortAndUniq(phones); - - for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++) - if (std::binary_search(phones->begin(), phones->end(), - trans_model.TransitionStateToPhone(tstate)) - && !(std::binary_search(pdfs.begin(), pdfs.end(), - trans_model.TransitionStateToForwardPdf(tstate)) && - std::binary_search(pdfs.begin(), pdfs.end(), - trans_model.TransitionStateToSelfLoopPdf(tstate))) ) - return false; - return true; -} - -bool TransitionModel::Compatible(const TransitionModel &other) const { - return (topo_ == other.topo_ && tuples_ == other.tuples_ && - state2id_ == other.state2id_ && id2state_ == other.id2state_ - && num_pdfs_ == other.num_pdfs_); -} - -bool TransitionModel::IsSelfLoop(int32 trans_id) const { - KALDI_ASSERT(static_cast(trans_id) < id2state_.size()); - int32 trans_state = id2state_[trans_id]; - int32 trans_index = trans_id - state2id_[trans_state]; - const Tuple &tuple = tuples_[trans_state-1]; - int32 phone = tuple.phone, hmm_state = tuple.hmm_state; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); - KALDI_ASSERT(static_cast(hmm_state) < 
entry.size()); - return (static_cast(trans_index) < entry[hmm_state].transitions.size() - && entry[hmm_state].transitions[trans_index].first == hmm_state); -} - -} // End namespace kaldi diff --git a/src/hmm/transition-model.h b/src/hmm/transition-model.h deleted file mode 100644 index c97980405c1..00000000000 --- a/src/hmm/transition-model.h +++ /dev/null @@ -1,371 +0,0 @@ -// hmm/transition-model.h - -// Copyright 2009-2012 Microsoft Corporation -// Johns Hopkins University (author: Guoguo Chen) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_HMM_TRANSITION_MODEL_H_ -#define KALDI_HMM_TRANSITION_MODEL_H_ - -#include "base/kaldi-common.h" -#include "util/const-integer-set.h" -#include "fst/fst-decl.h" // forward declarations. -#include "hmm/hmm-topology.h" -#include "itf/options-itf.h" -#include "itf/context-dep-itf.h" -#include "matrix/kaldi-vector.h" - -namespace kaldi { - -/// \addtogroup hmm_group -/// @{ - -// The class TransitionModel is a repository for the transition probabilities. -// It also handles certain integer mappings. -// The basic model is as follows. Each phone has a HMM topology defined in -// hmm-topology.h. Each HMM-state of each of these phones has a number of -// transitions (and final-probs) out of it. Each HMM-state defined in the -// HmmTopology class has an associated "pdf_class". This gets replaced with -// an actual pdf-id via the tree. The transition model associates the -// transition probs with the (phone, HMM-state, pdf-id). We associate with -// each such triple a transition-state. Each -// transition-state has a number of associated probabilities to estimate; -// this depends on the number of transitions/final-probs in the topology for -// that (phone, HMM-state). Each probability has an associated transition-index. -// We associate with each (transition-state, transition-index) a unique transition-id. -// Each individual probability estimated by the transition-model is associated with a -// transition-id. -// -// List of the various types of quantity referred to here and what they mean: -// phone: a phone index (1, 2, 3 ...) -// HMM-state: a number (0, 1, 2...) that indexes TopologyEntry (see hmm-topology.h) -// pdf-id: a number output by the Compute function of ContextDependency (it -// indexes pdf's, either forward or self-loop). Zero-based. -// transition-state: the states for which we estimate transition probabilities for transitions -// out of them. In some topologies, will map one-to-one with pdf-ids. -// One-based, since it appears on FSTs. -// transition-index: identifier of a transition (or final-prob) in the HMM. Indexes the -// "transitions" vector in HmmTopology::HmmState. [if it is out of range, -// equal to transitions.size(), it refers to the final-prob.] -// Zero-based. -// transition-id: identifier of a unique parameter of the TransitionModel. 
-// Associated with a (transition-state, transition-index) pair. -// One-based, since it appears on FSTs. -// -// List of the possible mappings TransitionModel can do: -// (phone, HMM-state, forward-pdf-id, self-loop-pdf-id) -> transition-state -// (transition-state, transition-index) -> transition-id -// Reverse mappings: -// transition-id -> transition-state -// transition-id -> transition-index -// transition-state -> phone -// transition-state -> HMM-state -// transition-state -> forward-pdf-id -// transition-state -> self-loop-pdf-id -// -// The main things the TransitionModel object can do are: -// Get initialized (need ContextDependency and HmmTopology objects). -// Read/write. -// Update [given a vector of counts indexed by transition-id]. -// Do the various integer mappings mentioned above. -// Get the probability (or log-probability) associated with a particular transition-id. - - -// Note: this was previously called TransitionUpdateConfig. -struct MleTransitionUpdateConfig { - BaseFloat floor; - BaseFloat mincount; - bool share_for_pdfs; // If true, share all transition parameters that have the same pdf. - MleTransitionUpdateConfig(BaseFloat floor = 0.01, - BaseFloat mincount = 5.0, - bool share_for_pdfs = false): - floor(floor), mincount(mincount), share_for_pdfs(share_for_pdfs) {} - - void Register (OptionsItf *opts) { - opts->Register("transition-floor", &floor, - "Floor for transition probabilities"); - opts->Register("transition-min-count", &mincount, - "Minimum count required to update transitions from a state"); - opts->Register("share-for-pdfs", &share_for_pdfs, - "If true, share all transition parameters where the states " - "have the same pdf."); - } -}; - -struct MapTransitionUpdateConfig { - BaseFloat tau; - bool share_for_pdfs; // If true, share all transition parameters that have the same pdf. - MapTransitionUpdateConfig(): tau(5.0), share_for_pdfs(false) { } - - void Register (OptionsItf *opts) { - opts->Register("transition-tau", &tau, "Tau value for MAP estimation of transition " - "probabilities."); - opts->Register("share-for-pdfs", &share_for_pdfs, - "If true, share all transition parameters where the states " - "have the same pdf."); - } -}; - -class TransitionModel { - - public: - /// Initialize the object [e.g. at the start of training]. - /// The class keeps a copy of the HmmTopology object, but not - /// the ContextDependency object. - TransitionModel(const ContextDependencyInterface &ctx_dep, - const HmmTopology &hmm_topo); - - - /// Constructor that takes no arguments: typically used prior to calling Read. - TransitionModel(): num_pdfs_(0) { } - - void Read(std::istream &is, bool binary); // note, no symbol table: topo object always read/written w/o symbols. - void Write(std::ostream &os, bool binary) const; - - - /// return reference to HMM-topology object. 
- const HmmTopology &GetTopo() const { return topo_; } - - /// \name Integer mapping functions - /// @{ - - int32 TupleToTransitionState(int32 phone, int32 hmm_state, int32 pdf, int32 self_loop_pdf) const; - int32 PairToTransitionId(int32 trans_state, int32 trans_index) const; - int32 TransitionIdToTransitionState(int32 trans_id) const; - int32 TransitionIdToTransitionIndex(int32 trans_id) const; - int32 TransitionStateToPhone(int32 trans_state) const; - int32 TransitionStateToHmmState(int32 trans_state) const; - int32 TransitionStateToForwardPdfClass(int32 trans_state) const; - int32 TransitionStateToSelfLoopPdfClass(int32 trans_state) const; - int32 TransitionStateToForwardPdf(int32 trans_state) const; - int32 TransitionStateToSelfLoopPdf(int32 trans_state) const; - int32 SelfLoopOf(int32 trans_state) const; // returns the self-loop transition-id, or zero if - // this state doesn't have a self-loop. - - inline int32 TransitionIdToPdf(int32 trans_id) const; - // TransitionIdToPdfFast is as TransitionIdToPdf but skips an assertion - // (unless we're in paranoid mode). - inline int32 TransitionIdToPdfFast(int32 trans_id) const; - - int32 TransitionIdToPhone(int32 trans_id) const; - int32 TransitionIdToPdfClass(int32 trans_id) const; - int32 TransitionIdToHmmState(int32 trans_id) const; - - /// @} - - bool IsFinal(int32 trans_id) const; // returns true if this trans_id goes to the final state - // (which is bound to be nonemitting). - bool IsSelfLoop(int32 trans_id) const; // return true if this trans_id corresponds to a self-loop. - - /// Returns the total number of transition-ids (note, these are one-based). - inline int32 NumTransitionIds() const { return id2state_.size()-1; } - - /// Returns the number of transition-indices for a particular transition-state. - /// Note: "Indices" is the plural of "index". Index is not the same as "id", - /// here. A transition-index is a zero-based offset into the transitions - /// out of a particular transition state. - int32 NumTransitionIndices(int32 trans_state) const; - - /// Returns the total number of transition-states (note, these are one-based). - int32 NumTransitionStates() const { return tuples_.size(); } - - // NumPdfs() actually returns the highest-numbered pdf we ever saw, plus one. - // In normal cases this should equal the number of pdfs in the system, but if you - // initialized this object with fewer than all the phones, and it happens that - // an unseen phone has the highest-numbered pdf, this might be different. - int32 NumPdfs() const { return num_pdfs_; } - - // This loops over the tuples and finds the highest phone index present. If - // the FST symbol table for the phones is created in the expected way, i.e.: - // starting from 1 ( is 0) and numbered contiguously till the last phone, - // this will be the total number of phones. - int32 NumPhones() const; - - /// Returns a sorted, unique list of phones. - const std::vector &GetPhones() const { return topo_.GetPhones(); } - - // Transition-parameter-getting functions: - BaseFloat GetTransitionProb(int32 trans_id) const; - BaseFloat GetTransitionLogProb(int32 trans_id) const; - - // The following functions are more specialized functions for getting - // transition probabilities, that are provided for convenience. - - /// Returns the log-probability of a particular non-self-loop transition - /// after subtracting the probability mass of the self-loop and renormalizing; - /// will crash if called on a self-loop. 
Specifically: - /// for non-self-loops it returns the log of (that prob divided by (1 minus - /// self-loop-prob-for-that-state)). - BaseFloat GetTransitionLogProbIgnoringSelfLoops(int32 trans_id) const; - - /// Returns the log-prob of the non-self-loop probability - /// mass for this transition state. (you can get the self-loop prob, if a self-loop - /// exists, by calling GetTransitionLogProb(SelfLoopOf(trans_state)). - BaseFloat GetNonSelfLoopLogProb(int32 trans_state) const; - - /// Does Maximum Likelihood estimation. The stats are counts/weights, indexed - /// by transition-id. This was previously called Update(). - void MleUpdate(const Vector &stats, - const MleTransitionUpdateConfig &cfg, - BaseFloat *objf_impr_out, - BaseFloat *count_out); - - /// Does Maximum A Posteriori (MAP) estimation. The stats are counts/weights, - /// indexed by transition-id. - void MapUpdate(const Vector &stats, - const MapTransitionUpdateConfig &cfg, - BaseFloat *objf_impr_out, - BaseFloat *count_out); - - /// Print will print the transition model in a human-readable way, for purposes of human - /// inspection. The "occs" are optional (they are indexed by pdf-id). - void Print(std::ostream &os, - const std::vector &phone_names, - const Vector *occs = NULL); - - - void InitStats(Vector *stats) const { stats->Resize(NumTransitionIds()+1); } - - void Accumulate(BaseFloat prob, int32 trans_id, Vector *stats) const { - KALDI_ASSERT(trans_id <= NumTransitionIds()); - (*stats)(trans_id) += prob; - // This is trivial and doesn't require class members, but leaves us more open - // to design changes than doing it manually. - } - - /// returns true if all the integer class members are identical (but does not - /// compare the transition probabilities. - bool Compatible(const TransitionModel &other) const; - - private: - void MleUpdateShared(const Vector &stats, - const MleTransitionUpdateConfig &cfg, - BaseFloat *objf_impr_out, BaseFloat *count_out); - void MapUpdateShared(const Vector &stats, - const MapTransitionUpdateConfig &cfg, - BaseFloat *objf_impr_out, BaseFloat *count_out); - void ComputeTuples(const ContextDependencyInterface &ctx_dep); // called from constructor. initializes tuples_. - void ComputeTuplesIsHmm(const ContextDependencyInterface &ctx_dep); - void ComputeTuplesNotHmm(const ContextDependencyInterface &ctx_dep); - void ComputeDerived(); // called from constructor and Read function: computes state2id_ and id2state_. - void ComputeDerivedOfProbs(); // computes quantities derived from log-probs (currently just - // non_self_loop_log_probs_; called whenever log-probs change. - void InitializeProbs(); // called from constructor. 
- void Check() const; - bool IsHmm() const; - - struct Tuple { - int32 phone; - int32 hmm_state; - int32 forward_pdf; - int32 self_loop_pdf; - Tuple() { } - Tuple(int32 phone, int32 hmm_state, int32 forward_pdf, int32 self_loop_pdf): - phone(phone), hmm_state(hmm_state), forward_pdf(forward_pdf), self_loop_pdf(self_loop_pdf) { } - bool operator < (const Tuple &other) const { - if (phone < other.phone) return true; - else if (phone > other.phone) return false; - else if (hmm_state < other.hmm_state) return true; - else if (hmm_state > other.hmm_state) return false; - else if (forward_pdf < other.forward_pdf) return true; - else if (forward_pdf > other.forward_pdf) return false; - else return (self_loop_pdf < other.self_loop_pdf); - } - bool operator == (const Tuple &other) const { - return (phone == other.phone && hmm_state == other.hmm_state - && forward_pdf == other.forward_pdf && self_loop_pdf == other.self_loop_pdf); - } - }; - - HmmTopology topo_; - - /// Tuples indexed by transition state minus one; - /// the tuples are in sorted order which allows us to do the reverse mapping from - /// tuple to transition state - std::vector tuples_; - - /// Gives the first transition_id of each transition-state; indexed by - /// the transition-state. Array indexed 1..num-transition-states+1 (the last one - /// is needed so we can know the num-transitions of the last transition-state. - std::vector state2id_; - - /// For each transition-id, the corresponding transition - /// state (indexed by transition-id). - std::vector id2state_; - - std::vector id2pdf_id_; - - /// For each transition-id, the corresponding log-prob. Indexed by transition-id. - Vector log_probs_; - - /// For each transition-state, the log of (1 - self-loop-prob). Indexed by - /// transition-state. - Vector non_self_loop_log_probs_; - - /// This is actually one plus the highest-numbered pdf we ever got back from the - /// tree (but the tree numbers pdfs contiguously from zero so this is the number - /// of pdfs). - int32 num_pdfs_; - - KALDI_DISALLOW_COPY_AND_ASSIGN(TransitionModel); -}; - -inline int32 TransitionModel::TransitionIdToPdf(int32 trans_id) const { - KALDI_ASSERT( - static_cast(trans_id) < id2pdf_id_.size() && - "Likely graph/model mismatch (graph built from wrong model?)"); - return id2pdf_id_[trans_id]; -} - -inline int32 TransitionModel::TransitionIdToPdfFast(int32 trans_id) const { - // Note: it's a little dangerous to assert this only in paranoid mode. - // However, this function is called in the inner loop of decoders and - // the assertion likely takes a significant amount of time. We make - // sure that past the end of the id2pdf_id_ array there are big - // numbers, which will make the calling code more likely to segfault - // (rather than silently die) if this is called for out-of-range values. - KALDI_PARANOID_ASSERT( - static_cast(trans_id) < id2pdf_id_.size() && - "Likely graph/model mismatch (graph built from wrong model?)"); - return id2pdf_id_[trans_id]; -} - -/// Works out which pdfs might correspond to the given phones. Will return true -/// if these pdfs correspond *just* to these phones, false if these pdfs are also -/// used by other phones. -/// @param trans_model [in] Transition-model used to work out this information -/// @param phones [in] A sorted, uniq vector that represents a set of phones -/// @param pdfs [out] Will be set to a sorted, uniq list of pdf-ids that correspond -/// to one of this set of phones. 
-/// @return Returns true if all of the pdfs output to "pdfs" correspond to phones from -/// just this set (false if they may be shared with phones outside this set). -bool GetPdfsForPhones(const TransitionModel &trans_model, - const std::vector &phones, - std::vector *pdfs); - -/// Works out which phones might correspond to the given pdfs. Similar to the -/// above GetPdfsForPhones(, ,) -bool GetPhonesForPdfs(const TransitionModel &trans_model, - const std::vector &pdfs, - std::vector *phones); -/// @} - - -} // end namespace kaldi - - -#endif diff --git a/src/hmm/transition-model-test.cc b/src/hmm/transitions-test.cc similarity index 83% rename from src/hmm/transition-model-test.cc rename to src/hmm/transitions-test.cc index 841c714efb1..8e2fe403f34 100644 --- a/src/hmm/transition-model-test.cc +++ b/src/hmm/transitions-test.cc @@ -17,22 +17,20 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. -#include "hmm/transition-model.h" +#include "hmm/transitions.h" #include "hmm/hmm-test-utils.h" namespace kaldi { -void TestTransitionModel() { - - TransitionModel *trans_model = GenRandTransitionModel(NULL); - +void TestTransitions() { + Transitions *trans_model = GenRandTransitions(NULL); bool binary = (rand() % 2 == 0); std::ostringstream os; trans_model->Write(os, binary); - TransitionModel trans_model2; + Transitions trans_model2; std::istringstream is2(os.str()); trans_model2.Read(is2, binary); @@ -41,7 +39,7 @@ void TestTransitionModel() { trans_model->Write(os1, false); trans_model2.Write(os2, false); KALDI_ASSERT(os1.str() == os2.str()); - KALDI_ASSERT(trans_model->Compatible(trans_model2)); + KALDI_ASSERT(*trans_model == trans_model2); } delete trans_model; } @@ -50,7 +48,6 @@ void TestTransitionModel() { int main() { for (int i = 0; i < 2; i++) - kaldi::TestTransitionModel(); + kaldi::TestTransitions(); KALDI_LOG << "Test OK.\n"; } - diff --git a/src/hmm/transitions.cc b/src/hmm/transitions.cc new file mode 100644 index 00000000000..7319fe0063a --- /dev/null +++ b/src/hmm/transitions.cc @@ -0,0 +1,344 @@ +// hmm/transitions.cc + +// Copyright 2009-2012 Microsoft Corporation +// Johns Hopkins University (author: Guoguo Chen) +// 2012-2019 Johns Hopkins University (Author: Daniel Povey) +// 2019 Hossein Hadian + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+
+#include <algorithm>
+#include "hmm/transitions.h"
+#include "tree/context-dep.h"
+#include "util/common-utils.h"
+#include "fstext/fstext-utils.h"
+
+namespace kaldi {
+
+bool Transitions::operator == (const Transitions &other) const {
+  return topo_ == other.topo_ && info_ == other.info_ &&
+      num_pdfs_ == other.num_pdfs_;
+}
+
+bool Transitions::Compatible(const Transitions& other) const {
+  KALDI_ASSERT(false);
+  return false;
+}
+
+void Transitions::ComputeInfo(const ContextDependencyInterface &ctx_dep) {
+  using StateId = typename fst::StdFst::StateId;
+  const std::vector<int32> &phones = topo_.GetPhones();
+  KALDI_ASSERT(!phones.empty());
+
+  // pdf_class_pairs is a set of lists indexed by phone.  Each list stores
+  // all unique (pdf-class, self-loop pdf-class) pairs that that phone
+  // can have (on its arcs).
+  std::vector<std::vector<std::pair<int32, int32> > > pdf_class_pairs;
+  pdf_class_pairs.resize(1 + *std::max_element(phones.begin(), phones.end()));
+  // to_arc_list is a list indexed by phone.  For each phone, it has a map which
+  // maps a possible pdf class pair (pdf-class, self-loop pdf-class) to all
+  // the arcs in that phone that match that pdf class pair.  An arc is represented
+  // as a (topo-state, arc-index) pair.
+  std::vector<std::map<std::pair<int32, int32>,
+                       std::vector<std::pair<int32, int32> > > > to_arc_list;
+  to_arc_list.resize(1 + *std::max_element(phones.begin(), phones.end()));
+
+  for (size_t i = 0; i < phones.size(); i++) {
+    int32 phone = phones[i];
+    const fst::StdVectorFst &entry = topo_.TopologyForPhone(phone);
+    int num_states = entry.NumStates();
+
+    std::vector<int32> state_to_self_loop_pdf_class(num_states, kNoPdf);
+    for (StateId state = 0; state < num_states; ++state) {
+      for (fst::ArcIterator<fst::StdVectorFst> aiter(entry, state);
+           !aiter.Done(); aiter.Next()) {
+        const fst::StdArc &arc(aiter.Value());
+        if (arc.nextstate == state) {
+          if (state_to_self_loop_pdf_class[state] != kNoPdf)
+            KALDI_ERR << "State " << state << " in topology of phone "
+                      << phone << " has more than one self-loop.";
+          state_to_self_loop_pdf_class[state] = arc.ilabel;
+        }
+      }
+    }
+
+    std::map<std::pair<int32, int32>,
+             std::vector<std::pair<int32, int32> > > &this_to_arc_list(
+                 to_arc_list[phone]);
+    for (StateId state = 0; state < num_states; ++state) {
+      for (fst::ArcIterator<fst::StdVectorFst> aiter(entry, state);
+           !aiter.Done(); aiter.Next()) {
+        const fst::StdArc &arc(aiter.Value());
+        int32 forward_pdf_class = arc.ilabel,
+            self_loop_pdf_class = state_to_self_loop_pdf_class[arc.nextstate];
+        auto state_arc_pair = std::make_pair(state, int32(aiter.Position()));
+        auto pdf_class_pair = std::make_pair(forward_pdf_class, self_loop_pdf_class);
+        this_to_arc_list[pdf_class_pair].push_back(state_arc_pair);
+      }
+    }
+    for (auto const &pdf_class_to_arc: this_to_arc_list) {
+      pdf_class_pairs[phone].push_back(pdf_class_to_arc.first);
+    }
+  }
+  // pdf_info will be a set of lists indexed by phone.  Each list is indexed by
+  // the same index as we index into pdf_class_pairs[phone], and the element is
+  // a list of possible (pdf, self-loop pdf) pairs that that (pdf-class,
+  // self-loop pdf-class) pair generates.
+  std::vector<std::vector<std::vector<std::pair<int32, int32> > > > pdf_info;
+
+  ctx_dep.GetPdfInfo(phones, pdf_class_pairs, &pdf_info);
+
+  info_.push_back(TransitionIdInfo());  // transition-id is 1-based, add a
+                                        // dummy for element zero.
+ + for (int32 i = 0; i < phones.size(); i++) { + int32 phone = phones[i]; + for (int32 j = 0; j < static_cast(pdf_info[phone].size()); j++) { // loop on pdf-class pairs + int32 pdf_class = pdf_class_pairs[phone][j].first, + self_loop_pdf_class = pdf_class_pairs[phone][j].second; + auto const &state_arc_vec = + to_arc_list[phone][std::make_pair(pdf_class, self_loop_pdf_class)]; + KALDI_ASSERT(!state_arc_vec.empty()); + for (auto const& state_arc_pair: state_arc_vec) { // loop on all arcs matching this pdf-class pair + int32 topo_state = state_arc_pair.first, + arc_index = state_arc_pair.second; + for (size_t m = 0; m < pdf_info[phone][j].size(); m++) { // loop on all pdf pairs for this pdf-class pair + int32 pdf = pdf_info[phone][j][m].first, + self_loop_pdf = pdf_info[phone][j][m].second; + if (self_loop_pdf_class == -1) + self_loop_pdf = -1; + TransitionIdInfo tuple{.phone = phone, .topo_state = topo_state, + .arc_index = arc_index, .pdf_id = pdf, .self_loop_pdf_id = self_loop_pdf}; + info_.push_back(tuple); + } + } + } + } + + std::sort(info_.begin(), info_.end()); // sort to enable reverse lookup. +} + +void Transitions::ComputeDerived() { + pdf_ids_.resize(info_.size()); + for (int32 tid = 1; tid <= NumTransitionIds(); ++tid) { + TransitionIdInfo &transition = info_[tid]; + auto const &entry = topo_.TopologyForPhone(transition.phone); // an FST + fst::ArcIterator aiter(entry, transition.topo_state); + aiter.Seek(transition.arc_index); + auto const &arc(aiter.Value()); + + transition.is_self_loop = (arc.nextstate == transition.topo_state); + transition.is_initial = (transition.topo_state == 0); + transition.is_final = (entry.Final(arc.nextstate) != fst::StdFst::Weight::Zero()); + transition.transition_cost = arc.weight.Value(); + if (transition.self_loop_pdf_id == -1) + transition.self_loop_transition_id = 0; + else { + // Find the self-loop of the destination state: + int32 arc_index = -1; + for (fst::ArcIterator aiter_next(entry, arc.nextstate); + !aiter_next.Done(); aiter_next.Next()) + if (aiter_next.Value().nextstate == arc.nextstate) { // Found the self-loop + arc_index = aiter_next.Position(); + break; + } + KALDI_ASSERT(arc_index != -1); + transition.self_loop_transition_id = + TupleToTransitionId(transition.phone, arc.nextstate, + arc_index, transition.self_loop_pdf_id, + transition.self_loop_pdf_id); + } + pdf_ids_[tid] = transition.pdf_id; + } +} + +Transitions::Transitions(const ContextDependencyInterface &ctx_dep, + const Topology &topo): topo_(topo), + num_pdfs_(ctx_dep.NumPdfs()) { + // First thing is to get all possible tuples. + ComputeInfo(ctx_dep); + ComputeDerived(); + Check(); +} + +int32 Transitions::TupleToTransitionId(int32 phone, int32 topo_state, + int32 arc_index, int32 pdf_id, + int32 self_loop_pdf_id) const { + TransitionIdInfo tuple{.phone = phone, .topo_state = topo_state, + .arc_index = arc_index, .pdf_id = pdf_id, .self_loop_pdf_id = self_loop_pdf_id}; + // Note: if this ever gets too expensive, which is unlikely, we can refactor + // this code to sort first on pdf, and then index on pdf, so those + // that have the same pdf are in a contiguous range. + auto lowerbound = std::lower_bound(info_.begin(), info_.end(), tuple); + if (lowerbound == info_.end() || !(*lowerbound == tuple)) { + bool is_end = (lowerbound == info_.end()); + const TransitionIdInfo &this_tuple = *lowerbound; + KALDI_ERR << "Tuple not found. 
(incompatible tree and model?)" + << std::boolalpha << is_end + << ", this_tuple pdf_id " << this_tuple.pdf_id; + } + + return static_cast((lowerbound - info_.begin())); +} + +void Transitions::Read(std::istream &is, bool binary) { + ExpectToken(is, binary, ""); + topo_.Read(is, binary); + ExpectToken(is, binary, ""); + int32 size; + ReadBasicType(is, binary, &size); + info_.resize(size); + for (int32 i = 0; i < size; i++) { + ReadBasicType(is, binary, &(info_[i].phone)); + ReadBasicType(is, binary, &(info_[i].topo_state)); + ReadBasicType(is, binary, &(info_[i].arc_index)); + ReadBasicType(is, binary, &(info_[i].pdf_id)); + ReadBasicType(is, binary, &(info_[i].self_loop_pdf_id)); + } + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &num_pdfs_); + ExpectToken(is, binary, ""); + ComputeDerived(); + Check(); +} + +void Transitions::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + if (!binary) os << "\n"; + topo_.Write(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, static_cast(info_.size())); + if (!binary) os << "\n"; + for (int32 i = 0; i < static_cast (info_.size()); i++) { + WriteBasicType(os, binary, info_[i].phone); + WriteBasicType(os, binary, info_[i].topo_state); + WriteBasicType(os, binary, info_[i].arc_index); + WriteBasicType(os, binary, info_[i].pdf_id); + WriteBasicType(os, binary, info_[i].self_loop_pdf_id); + if (!binary) os << "\n"; + } + WriteToken(os, binary, ""); + if (!binary) os << "\n"; + WriteBasicType(os, binary, num_pdfs_); + WriteToken(os, binary, ""); + if (!binary) os << "\n"; +} + +void Transitions::Check() const { + +} +const Transitions::TransitionIdInfo& +Transitions::InfoForTransitionId(int32 transition_id) const { + KALDI_ASSERT(transition_id > 0 && transition_id < info_.size()); + return info_[transition_id]; +} +void Transitions::Print(std::ostream &os, + const std::vector &phone_names, + const Vector *occs) { + if (occs != NULL) + KALDI_ASSERT(occs->Dim() == NumPdfs()); + for (int32 tid = 1; tid <= NumTransitionIds(); tid++) { + auto const &transition = info_[tid]; + KALDI_ASSERT(static_cast(transition.phone) < phone_names.size()); + std::string phone_name = phone_names[transition.phone]; + + os << "Transition-id " << tid << ": phone = " << phone_name + << " topo-state = " << transition.topo_state + << " arc-index = " << transition.arc_index + << " forward-pdf = " << transition.pdf_id << " self-loop-pdf = " + << transition.self_loop_pdf_id + << " p = " << transition.transition_cost; + if (occs != NULL) { + if (transition.is_self_loop) + os << " count of pdf = " << (*occs)(transition.self_loop_pdf_id); + else + os << " count of pdf = " << (*occs)(transition.pdf_id); + } + if (transition.is_self_loop) os << " [self-loop]\n"; + else { + auto const &entry = topo_.TopologyForPhone(transition.phone); // an FST + fst::ArcIterator aiter(entry, transition.topo_state); + aiter.Seek(transition.arc_index); + auto const &arc(aiter.Value()); + os << " [" << transition.topo_state << " -> " << arc.nextstate << "]\n"; + } + } +} + +int32 Transitions::PdfClassForTid(int32 tid) const { + auto&& info = InfoForTransitionId(tid); + auto&& fst = GetTopo().TopologyForPhone(info.phone); + fst::ArcIterator > aiter(fst, info.topo_state); + aiter.Seek(info.arc_index); + int32 pdf_class = aiter.Value().ilabel; + return pdf_class; +} + +bool GetPdfsForPhones(const Transitions &trans_model, + const std::vector &phones, + std::vector *pdfs) { + KALDI_ASSERT(IsSortedAndUniq(phones)); + KALDI_ASSERT(pdfs != NULL); + 
pdfs->clear(); + for (int32 tid = 1; tid <= trans_model.NumTransitionIds(); tid++) { + auto const &transition = trans_model.InfoForTransitionId(tid); + if (std::binary_search(phones.begin(), phones.end(), transition.phone)) { + pdfs->push_back(transition.pdf_id); + pdfs->push_back(transition.self_loop_pdf_id); + } + } + SortAndUniq(pdfs); + + for (int32 tid = 1; tid <= trans_model.NumTransitionIds(); tid++) { + auto const &transition = trans_model.InfoForTransitionId(tid); + if ((std::binary_search(pdfs->begin(), pdfs->end(), + transition.pdf_id) || + std::binary_search(pdfs->begin(), pdfs->end(), + transition.self_loop_pdf_id)) + && !std::binary_search(phones.begin(), phones.end(), + transition.phone)) + return false; + } + return true; +} + +bool GetPhonesForPdfs(const Transitions &trans_model, + const std::vector &pdfs, + std::vector *phones) { + KALDI_ASSERT(IsSortedAndUniq(pdfs)); + KALDI_ASSERT(phones != NULL); + phones->clear(); + for (int32 tid = 1; tid <= trans_model.NumTransitionIds(); tid++) { + auto const &transition = trans_model.InfoForTransitionId(tid); + if (std::binary_search(pdfs.begin(), pdfs.end(), transition.pdf_id) || + std::binary_search(pdfs.begin(), pdfs.end(), transition.self_loop_pdf_id)) + phones->push_back(transition.phone); + } + SortAndUniq(phones); + + for (int32 tid = 1; tid <= trans_model.NumTransitionIds(); tid++) { + auto const &transition = trans_model.InfoForTransitionId(tid); + if (std::binary_search(phones->begin(), phones->end(), + transition.phone) + && !(std::binary_search(pdfs.begin(), pdfs.end(), + transition.pdf_id) && + std::binary_search(pdfs.begin(), pdfs.end(), + transition.self_loop_pdf_id))) + return false; + } + return true; +} + + +} // End namespace kaldi diff --git a/src/hmm/transitions.h b/src/hmm/transitions.h new file mode 100644 index 00000000000..6bab0e627dc --- /dev/null +++ b/src/hmm/transitions.h @@ -0,0 +1,279 @@ +// hmm/transitions.h + +// Copyright 2009-2012 Microsoft Corporation +// 2015 Guoguo Chen +// 2019 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_HMM_TRANSITIONS_H_ +#define KALDI_HMM_TRANSITIONS_H_ + +#include "base/kaldi-common.h" +#include "util/const-integer-set.h" +#include "fst/fst-decl.h" // forward declarations. +#include "hmm/topology.h" +#include "itf/options-itf.h" +#include "itf/context-dep-itf.h" +#include "matrix/kaldi-vector.h" + +namespace kaldi { + +static const int kNoPdf = -1; + +// The class Transitions handles various integer mappings. +// It used to be the home for the trainable transitions, but these +// no longer exist. This class can be initialized from the +// tree and the topology. +// +// The topology of an individual phone is as defined in topology.h. 
+// +// This class basically defines the concept of a "transition-id", +// which is a construct that we use in compiled decoding graphs +// to make it easy to look up the 'pdf-id' (think of this as the +// distribution or neural net output column associated with this +// state) and also figure out which phone we are in and which +// arc in that phone. +// +// In the original Kaldi, this object contained trainable transition +// probabilities, but these have been removed to simplify things. +// +// A transition-id maps to a 4-tuple as follows: +// (pdf-id, phone, topo-state, arc-index) +// where 'topo-state' is the state index in the fst::StdFst +// for the topology, and 'arc-index' is the index of +// the arc leaving that state (zero for the first-listed one, +// one for the second, etc.) + + +// List of the various types of quantity referred to here and what they mean: +// phone: a phone index (1, 2, 3 ...) +// topo-state: a state index in the phone-topology FST (see topology.h) +// arc-index: The index of the arc leaving this topo-state: +// 0 for the first-listed one, 1 for the second. Will be used +// to Seek() in the ArcIterator. +// pdf-id: A number output by the Compute() function of ContextDependency (it +// indexes pdf's, either forward or self-loop). Zero-based. +// In DNN-based systems this would be the column index of +// the neural net output. +// Here, it's "this state". Presumably the source? +// (*)self-loop-pdf-id: The pdf-id associated with the self-loop of this state, +// if there is one (we do not allow >1), or -1 if there is no +// self-loop. This will be the same as 'pdf-id' if this transition +// *is* the self-loop. It might seem odd that we require this +// to get the transition-id for a non-self-loop arc; the reason +// why it's necessary is that we initially create the graph +// without self-loops (for efficiency) and we need to be able +// to look up the corresponding self-loop transition-id to +// add self-loops to the graph. Duh! That makes complete sense! +// +// transition-id: The numbers that we put on the decoding-graph arcs. +// Each transition-id is associated with a 4-tuple +// (pdf-id, phone, topo-state, arc-index). +// + + +class Transitions { + + public: + /// Initialize the object. This is deterministic, so initializing + /// from the same objects will give you an equivalent numbering. + /// The class keeps a copy of the Topology object, but not + /// the ContextDependency object. + Transitions(const ContextDependencyInterface &ctx_dep, + const Topology &topo); + + + /// Constructor that takes no arguments: typically used prior to calling Read. + Transitions(): num_pdfs_(0) { } + + void Read(std::istream &is, bool binary); + void Write(std::ostream &os, bool binary) const; + + // This struct is the information associated with one transition-id. + // You can work out the transition-id from the first 5 fields. + struct TransitionIdInfo { + + int32 phone; // The phone + int32 topo_state; // The state in the topology FST for this phone + int32 arc_index; // The arc-index leaving this state + int32 pdf_id; // The pdf-id associated with this arc (obtained from the + // tree and phonetic-context information, etc.) + + int32 self_loop_pdf_id; // The pdf-id associated with the self-loop + // transition (if any) leaving the *destination* + // state of this arc, or -1 if that state has no + // self-loop. Search for (*) above for + // explanation. 
+ + // The remaining fields are 'derived information' that are worked out + // from the information above and from the phone topology, and placed + // here for convenience. + + // is_self_loop is true if this is a self-loop (a transition to the same + // state). We often need to know this, so it's convenient to have this + // information here. + bool is_self_loop; + // is_initial is true if this is a transition leaving the + // initial state. + // you transition through the HMM (we check that the topology has no + // other transitions to the first HMM-state). + bool is_initial; + + // is_final is true if this is a transition entering a final + // state. This is used together with is_initial (and boundary + // information) to locate phone boundaries, e.g. for lattice + // word alignment: an 'is_final' transition-id followed by an + // 'is_initial' transition-id marks a phone boundary, which + // we know because we do not allow the start-state in + // topologies to be final. + bool is_final; + + // transition_cost is the cost (negative log-prob) of this transition). + BaseFloat transition_cost; + // The transition-id associated with the self-loop of the *destination* of + // this arc, if there is one, or 0 if there is no such self-loop. + int32 self_loop_transition_id; + + + bool operator < (const TransitionIdInfo &other) const { + if (phone < other.phone) return true; + else if (phone > other.phone) return false; + else if (topo_state < other.topo_state) return true; + else if (topo_state > other.topo_state) return false; + else if (arc_index < other.arc_index) return true; + else if (arc_index > other.arc_index) return false; + else if (pdf_id < other.pdf_id) return true; + else if (pdf_id > other.pdf_id) return false; + else return (self_loop_pdf_id < other.self_loop_pdf_id); + } + + // Compare all non-derived members. + bool operator == (const TransitionIdInfo &other) const { + return (phone == other.phone && topo_state == other.topo_state && + arc_index == other.arc_index && pdf_id == other.pdf_id && + self_loop_pdf_id == other.self_loop_pdf_id); + } + }; + + + /// return reference to HMM-topology object. + const Topology &GetTopo() const { return topo_; } + + const TransitionIdInfo &InfoForTransitionId(int32 transition_id) const; + + inline int32 TransitionIdToPdfFast(int32 trans_id) const; + + /// This allows you to look up a transition-id. It returns 0 if nothing + /// was found. + int32 TupleToTransitionId(int32 phone, int32 topo_state, int32 arc_index, + int32 pdf_id, int32 self_loop_pdf_id) const; + + + /// Returns the total number of transition-ids (note, these are one-based). + inline int32 NumTransitionIds() const { return info_.size() - 1; } + + // NumPdfs() returns the number of pdfs (pdf-ids) in the tree, + // as returned by ctx_dep.NumPdfs() for the tree passed to the constructor. + int32 NumPdfs() const { return num_pdfs_; } + + /// Returns a sorted, unique list of phones. + const std::vector &GetPhones() const { return topo_.GetPhones(); } + + + /// Print will print the transition model in a human-readable way, for purposes of human + /// inspection. The "occs" are optional (they are indexed by pdf-id). + void Print(std::ostream &os, + const std::vector &phone_names, + const Vector *occs = NULL); + + int32 PdfClassForTid(int32 tid) const; + + /// returns true if this is identical to 'other' + bool operator == (const Transitions &other) const; + + bool Compatible(const Transitions& other) const; + + private: + + // Called from constructor. 
+  // 5 fields); the implementation then has to call ComputeDerived()
+  // to initialize the rest.
+  void ComputeInfo(const ContextDependencyInterface &ctx_dep);
+
+  void ComputeDerived();  // Called from constructor and Read function.
+
+  void Check() const;
+
+
+  Topology topo_;
+
+  /// Information about transition-ids, indexed by transition-id.
+  /// The tuples are in lexicographically sorted order, which allows us to do
+  /// the reverse mapping from tuple to transition-id.
+  std::vector<TransitionIdInfo> info_;
+
+
+  /// Accessing pdf_ids_[i] allows us to look up info_[i].pdf_id in a way that
+  /// is more friendly to memory caches than accessing info_; this is done in
+  /// the inner loops of decoders so it makes sense to optimize for it.
+  std::vector<int32> pdf_ids_;
+
+  /// This is a copy of the NumPdfs() returned by the tree when we constructed
+  /// this object.  Note: pdf-ids are zero-based.
+  int32 num_pdfs_;
+
+  KALDI_DISALLOW_COPY_AND_ASSIGN(Transitions);
+};
+
+inline int32 Transitions::TransitionIdToPdfFast(int32 trans_id) const {
+  // Note: it's a little dangerous to assert this only in paranoid mode.
+  // However, this function is called in the inner loop of decoders and
+  // the assertion would likely take a significant amount of time.  We make
+  // sure that past the end of the pdf_ids_ array there are large values,
+  // which will make the calling code more likely to crash loudly (rather
+  // than silently produce wrong output) if this is called with
+  // out-of-range values.
+  KALDI_PARANOID_ASSERT(
+      static_cast<size_t>(trans_id) < pdf_ids_.size() &&
+      "Likely graph/model mismatch (graph built from wrong model?)");
+  return pdf_ids_[trans_id];
+}
+
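As a usage sketch (not part of the patch), the interface above might be exercised as follows; the function name ExamplePdfLookup is hypothetical, and a constructed (or Read) Transitions object is assumed:

    // Illustrative sketch only: looking up per-transition-id information.
    #include "base/kaldi-common.h"
    #include "hmm/transitions.h"

    namespace kaldi {

    int32 ExamplePdfLookup(const Transitions &trans, int32 tid) {
      // Fast path used in decoder inner loops: just the pdf-id.
      int32 pdf_id = trans.TransitionIdToPdfFast(tid);

      // Full information, e.g. for alignment post-processing.
      const Transitions::TransitionIdInfo &info = trans.InfoForTransitionId(tid);
      KALDI_ASSERT(info.pdf_id == pdf_id);

      // The 5-tuple maps back to the transition-id; 0 would mean "not found".
      int32 tid2 = trans.TupleToTransitionId(info.phone, info.topo_state,
                                             info.arc_index, info.pdf_id,
                                             info.self_loop_pdf_id);
      KALDI_ASSERT(tid2 == tid);
      return pdf_id;
    }

    }  // namespace kaldi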
+/// Works out which pdfs might correspond to the given phones.  Will return true
+/// if these pdfs correspond *just* to these phones, false if these pdfs are also
+/// used by other phones.
+/// @param trans_model [in] Transition-model used to work out this information
+/// @param phones [in] A sorted, unique vector that represents a set of phones
+/// @param pdfs [out] Will be set to a sorted, unique list of pdf-ids that
+///                   correspond to one of this set of phones.
+/// @return Returns true if all of the pdfs output to "pdfs" correspond to phones from
+///         just this set (false if they may be shared with phones outside this set).
+bool GetPdfsForPhones(const Transitions &trans_model,
+                      const std::vector<int32> &phones,
+                      std::vector<int32> *pdfs);
+
+/// Works out which phones might correspond to the given pdfs.  Similar to the
+/// above GetPdfsForPhones().
+bool GetPhonesForPdfs(const Transitions &trans_model,
+                      const std::vector<int32> &pdfs,
+                      std::vector<int32> *phones);
+/// @}
+
+
+}  // end namespace kaldi
+
+
+#endif
diff --git a/src/hmm/tree-accu.cc b/src/hmm/tree-accu.cc
index c8ce49d9bc7..18a613b8a5c 100644
--- a/src/hmm/tree-accu.cc
+++ b/src/hmm/tree-accu.cc
@@ -33,7 +33,7 @@ static int32 MapPhone(const std::vector<int32> &phone_map,
 }
 
-void AccumulateTreeStats(const TransitionModel &trans_model,
+void AccumulateTreeStats(const Transitions &trans_model,
                          const AccumulateTreeStatsInfo &info,
                          const std::vector<int32> &alignment,
                          const Matrix<BaseFloat> &features,
@@ -54,8 +54,8 @@ void AccumulateTreeStats(const TransitionModel &trans_model,
         i + info.central_position < static_cast<int32>(split_alignment.size())) {
       int32 central_phone = MapPhone(info.phone_map,
-                                     trans_model.TransitionIdToPhone(
-                                         split_alignment[i+info.central_position][0]));
+                                     trans_model.InfoForTransitionId(
+                                         split_alignment[i+info.central_position][0]).phone);
       bool is_ctx_dep = !std::binary_search(info.ci_phones.begin(),
                                             info.ci_phones.end(), central_phone);
@@ -65,7 +65,7 @@ void AccumulateTreeStats(const TransitionModel &trans_model,
         if (i + j >= 0 && i + j < static_cast<int32>(split_alignment.size()))
           phone = MapPhone(info.phone_map,
-                           trans_model.TransitionIdToPhone(split_alignment[i+j][0]));
+                           trans_model.InfoForTransitionId(split_alignment[i+j][0]).phone);
         else
           phone = 0;  // ContextDependency class uses 0 to mean "out of window";
                       // we also set the phone arbitrarily to 0
@@ -84,9 +84,8 @@ void AccumulateTreeStats(const TransitionModel &trans_model,
       for (int32 j = 0; j < static_cast<int32>(split_alignment[i+info.central_position].size());j++) {
         // for central phone of this window...
         EventType evec_more(evec);
-        int32 pdf_class = trans_model.TransitionIdToPdfClass(
-            split_alignment[i+info.central_position][j]);
-        // pdf_class will normally by 0, 1 or 2 for 3-state HMM.
+        int32 pdf_class = trans_model.PdfClassForTid(split_alignment[i+info.central_position][j]);
+        // pdf_class will normally be 0, 1 or 2 for 3-state HMM.
         std::pair<EventKeyType, EventValueType> pr(kPdfClass, pdf_class);
         evec_more.push_back(pr);
         std::sort(evec_more.begin(), evec_more.end());  // these must be sorted!
diff --git a/src/hmm/tree-accu.h b/src/hmm/tree-accu.h
index 92e83c535c7..fd3e09567b5 100644
--- a/src/hmm/tree-accu.h
+++ b/src/hmm/tree-accu.h
@@ -23,7 +23,7 @@
 #include <cctype>  // For isspace.
 #include <vector>
 #include "base/kaldi-common.h"
-#include "hmm/transition-model.h"
+#include "hmm/transitions.h"
 #include "tree/clusterable-classes.h"
 #include "tree/build-tree-questions.h"  // needed for this typedef:
 // typedef std::vector<std::pair<EventType, Clusterable*> > BuildTreeStatsType;
@@ -74,7 +74,7 @@ struct AccumulateTreeStatsInfo {
 /// "normal" way).  It adds to 'stats' the stats obtained from this file.  Any
 /// new GaussClusterable* pointers in "stats" will be allocated with "new".
-void AccumulateTreeStats(const TransitionModel &trans_model,
+void AccumulateTreeStats(const Transitions &trans_model,
                          const AccumulateTreeStatsInfo &info,
                          const std::vector<int32> &alignment,
                          const Matrix<BaseFloat> &features,
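The tree-accu.cc hunks above replace TransitionModel::TransitionIdToPhone() and TransitionIdToPdfClass() with lookups through the new TransitionIdInfo struct and PdfClassForTid(). A minimal sketch of the same migration pattern, for code that extracts a phone sequence from an alignment; the helper name AlignmentToPhones is hypothetical and only assumes the transitions.h interface declared above:

    // Hypothetical helper, not part of the patch: maps each frame of an
    // alignment to its phone using the new Transitions interface.
    #include <vector>
    #include "hmm/transitions.h"

    namespace kaldi {

    void AlignmentToPhones(const Transitions &trans,
                           const std::vector<int32> &alignment,
                           std::vector<int32> *phones) {
      phones->clear();
      phones->reserve(alignment.size());
      for (size_t i = 0; i < alignment.size(); i++) {
        // Old API: trans.TransitionIdToPhone(alignment[i]);
        // New API: look up the per-transition-id info struct and read .phone.
        phones->push_back(trans.InfoForTransitionId(alignment[i]).phone);
      }
    }

    }  // namespace kaldi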
diff --git a/src/itf/context-dep-itf.h b/src/itf/context-dep-itf.h
index b62bd11e11a..9db5a36c70c 100644
--- a/src/itf/context-dep-itf.h
+++ b/src/itf/context-dep-itf.h
@@ -62,9 +62,9 @@ class ContextDependencyInterface {
   /// GetPdfInfo returns a vector indexed by pdf-id, saying for each pdf which
   /// pairs of (phone, pdf-class) it can correspond to.  (Usually just one).
-  /// c.f. hmm/hmm-topology.h for meaning of pdf-class.
+  /// c.f. hmm/topology.h for meaning of pdf-class.
   /// This is the old, simpler interface of GetPdfInfo(), and that this one can
-  /// only be called if the HmmTopology object's IsHmm() function call returns
+  /// only be called if the Topology object's IsHmm() function call returns
   /// true.
   virtual void GetPdfInfo(
       const std::vector<int32> &phones,  // list of phones
diff --git a/src/ivector/Makefile b/src/ivector/Makefile
index 1154da6880b..ad53c9007b2 100644
--- a/src/ivector/Makefile
+++ b/src/ivector/Makefile
@@ -13,7 +13,7 @@ OBJFILES = ivector-extractor.o voice-activity-detection.o plda.o \
 LIBNAME = kaldi-ivector
 
 ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a
diff --git a/src/ivectorbin/Makefile b/src/ivectorbin/Makefile
index 8dc3498b83b..c261ed3e28e 100644
--- a/src/ivectorbin/Makefile
+++ b/src/ivectorbin/Makefile
@@ -26,7 +26,7 @@ TESTFILES =
 
 ADDLIBS = ../ivector/kaldi-ivector.a ../hmm/kaldi-hmm.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
           ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/kws/Makefile b/src/kws/Makefile
index c4367eb2958..9dc7bddab70 100644
--- a/src/kws/Makefile
+++ b/src/kws/Makefile
@@ -10,7 +10,7 @@ OBJFILES = kws-functions.o kws-functions2.o kws-scoring.o
 LIBNAME = kaldi-kws
 
 ADDLIBS = ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \
-          ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a
+          ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/kwsbin/Makefile b/src/kwsbin/Makefile
index bcc2685b7f3..f03b0a07f92 100644
--- a/src/kwsbin/Makefile
+++ b/src/kwsbin/Makefile
@@ -17,6 +17,6 @@ TESTFILES =
 ADDLIBS = ../kws/kaldi-kws.a ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a \
           ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a
+          ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/lat/Makefile b/src/lat/Makefile
index 56521486826..3d4c6afcc79 100644
--- a/src/lat/Makefile
+++ b/src/lat/Makefile
@@ -16,7 +16,7 @@ OBJFILES = kaldi-lattice.o lattice-functions.o word-align-lattice.o \
 LIBNAME = kaldi-lat
 
 ADDLIBS = ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a
+          ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
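The next hunk (determinize-lattice-pruned.cc) removes a test that used TransitionIdToHmmState() and IsSelfLoop() to find the first transition of a phone. With the TransitionIdInfo fields documented in transitions.h, an equivalent check might look like the sketch below. This is illustrative only, assuming the is_initial/is_self_loop semantics described above; it is not necessarily the exact replacement used in the patch, and the helper name is hypothetical:

    // Illustrative sketch only: detecting the first (non-self-loop) transition
    // of a phone with the new interface, e.g. when inserting phone labels.
    #include "hmm/transitions.h"

    namespace kaldi {

    bool IsFirstTransitionOfPhone(const Transitions &trans, int32 tid) {
      const Transitions::TransitionIdInfo &info = trans.InfoForTransitionId(tid);
      // A transition that leaves the initial topology state and is not a
      // self-loop is seen exactly once per phone instance.
      return info.is_initial && !info.is_self_loop;
    }

    }  // namespace kaldi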
diff --git a/src/lat/determinize-lattice-pruned.cc b/src/lat/determinize-lattice-pruned.cc
index bdf8c3fabc8..cdc920ef95b 100644
--- a/src/lat/determinize-lattice-pruned.cc
+++ b/src/lat/determinize-lattice-pruned.cc
@@ -1294,7 +1294,7 @@ bool DeterminizeLatticePruned(const ExpandedFst<ArcTpl<Weight> > &ifst,
 
 template<class Weight>
 typename ArcTpl<Weight>::Label DeterminizeLatticeInsertPhones(
-    const kaldi::TransitionModel &trans_model,
+    const kaldi::Transitions &trans_model,
     MutableFst<ArcTpl<Weight> > *fst) {
   // Define some types.
   typedef ArcTpl<Weight> Arc;
@@ -1316,32 +1316,28 @@ typename ArcTpl<Weight>::Label DeterminizeLatticeInsertPhones(
        !aiter.Done(); aiter.Next()) {
       Arc arc = aiter.Value();
 
-      // Note: the words are on the input symbol side and transition-id's are on
+      // Note: the words are on the input symbol side and transition-ids are on
       // the output symbol side.
-      if ((arc.olabel != 0)
-          && (trans_model.TransitionIdToHmmState(arc.olabel) == 0)
-          && (!trans_model.IsSelfLoop(arc.olabel))) {
-        Label phone =
-            static_cast