
Comparing changes

base repository: OpenNMT/OpenNMT-py (base: master)
head repository: eduamf/OpenNMT-py (compare: master)

Commits on Sep 25, 2018

  1. Add files via upload

    Portuguese non-breaking prefix included.
    eduamf authored Sep 25, 2018 (b008283)
  2. Adding language prefixes.

    Some language prefixes were included from Moses Machine Translator.
    eduamf authored Sep 25, 2018 (4f6de4a)

Commits on Sep 28, 2018

  1. Portuguese non-breaking prefixes

    From MOSES.
    eduamf authored Sep 28, 2018 (68b5949)

Commits on Oct 14, 2018

  1. NMTLab 1.0

    eduamf committed Oct 14, 2018 (026f079)

Commits on Dec 3, 2018

  1. Updating from main Master.

    inputter.py vocabulary size when not pruned was fixed.
    eduamf committed Dec 3, 2018 (da718df)
  2. Updating from main Master

    eduamf committed Dec 3, 2018 (1a8ec5c)

Commits on Dec 7, 2018

  1. Update .gitignore

    Files and folders to ignore
    eduamf committed Dec 7, 2018 (cc0341c)
  2. Updates

    Updates from main Master
    eduamf committed Dec 7, 2018 (e411073)
  3. Update from main Master

    Update from main Master
    eduamf committed Dec 7, 2018 (594aff9)
  4. Update from main Master

    Update from main Master
    eduamf committed Dec 7, 2018 (c8a3007)
  5. Update from main Master

    Update from main Master
    eduamf committed Dec 7, 2018 (aed6018)
  6. Update from main Master

    Update from main Master
    eduamf committed Dec 7, 2018 (872cf81)
  7. Remove from my branch

    Remove from my branch
    eduamf committed Dec 7, 2018 (f67991a)
  8. Update from main Master

    Update from main Master
    eduamf committed Dec 7, 2018 (05ce745)
  9. Fix vocab size and set letters only

    Fix vocab size and set letters only
    eduamf committed Dec 7, 2018 (9548b52)
  10. Update preprocess to only_letters_vocab

    Update preprocess to only_letters_vocab.
    eduamf committed Dec 7, 2018 (641878d)
  11. Update preprocess merge

    Update preprocess merging with main Master.
    eduamf committed Dec 7, 2018 (60d9dbf)
  12. Update train with main Master

    Update train with main Master
    eduamf committed Dec 7, 2018 (06e40ba)
  13. Correction from Master

    Correction from Master
    eduamf committed Dec 7, 2018 (0d292da)
  14. MyInput.py

    My inputters
    eduamf committed Dec 7, 2018 (c420c10)
  15. Including condition -only_letters_vocab

    Including condition -only_letters_vocab
    eduamf committed Dec 7, 2018 (8fb9c32)

Commits on Dec 9, 2018

  1. Build of vocabulary not using lower

    Build of vocabulary not using lower. One more reason to have redundant words.
    eduamf committed Dec 9, 2018 (3cbcfce)
  2. Fixing lower parameter

    Fixing lower parameter
    eduamf committed Dec 9, 2018 (cf42b06)
  3. Using lower and only_letters

    Using lower and only_letters
    eduamf committed Dec 9, 2018 (30efcd8)
  4. Changes fixed

    Changes fixed
    eduamf committed Dec 9, 2018 (92b5170)
  5. Update others

    Update others
    eduamf committed Dec 9, 2018 (661e70c)

Commits on Dec 10, 2018

  1. Inputter evaluation

    Inputter evaluation
    eduamf committed Dec 10, 2018 (a2e6217)

Commits on Dec 25, 2018

  1. bec23ff

Commits on Dec 26, 2018

  1. --only_words description

    eduamf committed Dec 26, 2018 (4ec1d31)
  2. 7c1f087
  3. Regular expression fixed.

    eduamf committed Dec 26, 2018 (46a4e0f)
  4. Updated to Doc version.

    eduamf committed Dec 26, 2018 (b4ac6cc)
  5. 0d5b565

Commits on Jun 25, 2019

  1. Version used by prototype.

    eduamf committed Jun 25, 2019 (9f301c1)

Commits on Jun 26, 2019

  1. Server updates and protection of terms

    Server updates and protection of terms
    eduamf authored Jun 26, 2019 (eeeeec0)
  2. Updated README

    eduamf authored Jun 26, 2019 (77856c4)
  3. Backup including new options.

    eduamf committed Jun 26, 2019 (92fe938)
  4. New options.

    eduamf committed Jun 26, 2019 (568d978)
  5. Adjusts in vocabulary control.

    eduamf committed Jun 26, 2019 (3520239)
  6. Including historical changes to be considered as a possible solution in…

    … case error handling fails.
    eduamf committed Jun 26, 2019 (a7fd80f)
  7. Fixing errors. The web service was losing linefeeds. Then, it was ov…

    …erloading the tokenizer.
    eduamf committed Jun 26, 2019 (065fdc0)
  8. 5bbbb63
  9. b7c7d10
  10. 4e1c02d
  11. 953605b

Commits on Jul 2, 2019

  1. File added.

    eduamf committed Jul 2, 2019 (1e8429e)
  2. Memory usage to Tensorboard.

    eduamf committed Jul 2, 2019 (2bd6b19)

Commits on Jul 3, 2019

  1. Memory usage to Tensorboard.

    eduamf committed Jul 3, 2019 (a6c25b3)
  2. Memory usage to Tensorboard.

    eduamf committed Jul 3, 2019 (43f2eef)
Showing with 10,113 additions and 7,145 deletions.
  1. +3 −0 .gitignore
  2. +5 −5 .travis.yml
  3. +119 −0 CHANGELOG.md
  4. +82 −5 CONTRIBUTING.md
  5. +38 −15 README.md
  6. +31 −0 config/config-rnn-summarization.yml
  7. +42 −0 config/config-transformer-base-1GPU.yml
  8. +45 −0 config/config-transformer-base-4GPU.yml
  9. +1 −1 data/README.md
  10. +1 −0 docs/requirements.txt
  11. +78 −3 docs/source/CONTRIBUTING.md
  12. +29 −16 docs/source/FAQ.md
  13. +16 −15 docs/source/Library.ipynb
  14. +142 −124 docs/source/Library.md
  15. +43 −26 docs/source/Summarization.md
  16. +18 −2 docs/source/conf.py
  17. +4 −4 docs/source/extended.md
  18. +5 −5 docs/source/im2text.md
  19. +31 −11 docs/source/index.rst
  20. +3 −1 docs/source/main.md
  21. +24 −20 docs/source/onmt.modules.rst
  22. +6 −9 docs/source/onmt.rst
  23. +16 −4 docs/source/onmt.translation.rst
  24. +14 −1 docs/source/options/preprocess.md
  25. +86 −19 docs/source/options/train.md
  26. +14 −4 docs/source/options/translate.md
  27. +5 −2 docs/source/quickstart.md
  28. +103 −0 docs/source/refs.bib
  29. +3 −3 docs/source/speech2text.md
  30. +85 −0 extract_embeddings.py
  31. +13 −0 mostra_opcoes.lua
  32. +1 −1 onmt/__init__.py
  33. +11 −0 onmt/decoders/__init__.py
  34. +70 −97 onmt/decoders/cnn_decoder.py
  35. +215 −244 onmt/decoders/decoder.py
  36. +153 −0 onmt/decoders/ensemble.py
  37. +123 −213 onmt/decoders/transformer.py
  38. +8 −1 onmt/encoders/__init__.py
  39. +119 −52 onmt/encoders/audio_encoder.py
  40. +13 −4 onmt/encoders/cnn_encoder.py
  41. +13 −9 onmt/encoders/encoder.py
  42. +34 −15 onmt/encoders/image_encoder.py
  43. +22 −6 onmt/encoders/mean_encoder.py
  44. +20 −12 onmt/encoders/rnn_encoder.py
  45. +42 −32 onmt/encoders/transformer.py
  46. +19 −18 onmt/inputters/__init__.py
  47. +196 −265 onmt/inputters/audio_dataset.py
  48. +45 −0 onmt/inputters/datareader_base.py
  49. +143 −109 onmt/inputters/dataset_base.py
  50. +175 −0 onmt/inputters/dataset_base_070_eduamf.py
  51. +76 −225 onmt/inputters/image_dataset.py
  52. +641 −443 onmt/inputters/inputter.py
  53. +758 −0 onmt/inputters/inputter_eduamf.py
  54. +161 −392 onmt/inputters/text_dataset.py
  55. +95 −0 onmt/inputters/text_dataset_eduamf.py
  56. +117 −144 onmt/model_builder.py
  57. +21 −31 onmt/models/model.py
  58. +40 −38 onmt/models/model_saver.py
  59. +2 −3 onmt/models/sru.py
  60. +7 −5 onmt/modules/__init__.py
  61. +19 −13 onmt/modules/average_attn.py
  62. +5 −8 onmt/modules/conv_multi_step_attention.py
  63. +124 −101 onmt/modules/copy_generator.py
  64. +80 −31 onmt/modules/embeddings.py
  65. +23 −23 onmt/modules/global_attention.py
  66. +74 −46 onmt/modules/multi_headed_attn.py
  67. +11 −16 onmt/modules/position_ffn.py
  68. +45 −23 onmt/modules/sparse_activations.py
  69. +12 −13 onmt/modules/sparse_losses.py
  70. +3 −14 onmt/modules/structured_attention.py
  71. +14 −17 onmt/modules/util_class.py
  72. +624 −490 onmt/opts.py
  73. +718 −0 onmt/opts_eduamf.py
  74. +120 −13 onmt/tests/pull_request_chk.sh
  75. +5 −5 onmt/tests/rebuild_test_models.sh
  76. BIN onmt/tests/test_model.pt
  77. BIN onmt/tests/test_model2.pt
  78. +64 −62 onmt/tests/test_models.py
  79. +1 −1 onmt/tests/test_models.sh
  80. +65 −44 onmt/tests/test_preprocess.py
  81. 0 onmt/train_multi.py
  82. +63 −86 onmt/train_single.py
  83. +252 −173 onmt/trainer.py
  84. +6 −2 onmt/translate/__init__.py
  85. +98 −48 onmt/translate/beam.py
  86. +70 −48 onmt/translate/penalties.py
  87. +51 −45 onmt/translate/translation.py
  88. +205 −94 onmt/translate/translation_server.py
  89. +573 −0 onmt/translate/translation_server_eduamf.py
  90. +636 −467 onmt/translate/translator.py
  91. +7 −5 onmt/utils/__init__.py
  92. +1 −2 onmt/utils/cnn_factory.py
  93. +11 −11 onmt/utils/distributed.py
  94. +194 −0 onmt/utils/earlystopping.py
  95. +6 −5 onmt/utils/logging.py
  96. +74 −85 onmt/utils/loss.py
  97. +72 −1 onmt/utils/misc.py
  98. +452 −161 onmt/utils/optimizers.py
  99. +19 −9 onmt/utils/report_manager.py
  100. +0 −2 onmt/utils/rnn_factory.py
  101. +11 −6 onmt/utils/statistics.py
  102. +113 −160 preprocess.py
  103. +164 −0 preprocess_eduamf.py
  104. +6 −2 requirements.opt.txt
  105. +4 −3 requirements.txt
  106. +12 −4 server.py
  107. +1 −1 setup.py
  108. +0 −1,002 src_embeddings.txt
  109. +0 −1,002 tgt_embeddings.txt
  110. +8 −0 tok_sentences.sh
  111. 0 tools/apply_bpe.py
  112. 0 tools/average_models.py
  113. +1 −1 tools/bpe_pipeline.sh
  114. +98 −0 tools/create_vocabulary.py
  115. 0 tools/detokenize.perl
  116. +103 −87 tools/embeddings_to_torch.py
  117. +11 −15 tools/extract_embeddings.py
  118. 0 tools/learn_bpe.py
  119. 0 tools/multi-bleu-detok.perl
  120. +209 −0 tools/nonbreaking_prefixes/nonbreaking_prefix.pt
  121. 0 tools/release_model.py
  122. +8 −3 tools/test_rouge.py
  123. 0 tools/tokenizer.perl
  124. +91 −22 train.py
  125. +31 −19 translate.py
3 changes: 3 additions & 0 deletions .gitignore
@@ -95,6 +95,9 @@ env/
venv/
ENV/

# My changes
eduamf/

# Spyder project settings
.spyderproject
.spyproject
10 changes: 5 additions & 5 deletions .travis.yml
@@ -17,7 +17,7 @@ before_install:
# Useful for debugging any issues with conda
- conda info -a
# freeze the supported pytorch version for consistency
- conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION pytorch=0.4.0 -c soumith
- conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION pytorch=0.4.1 cuda92 -c pytorch
- source activate test-environment
# use requirements.txt for dependencies
- pip install -r requirements.txt
@@ -32,15 +32,15 @@ install:
script:
- wget -O /tmp/im2text.tgz http://lstm.seas.harvard.edu/latex/im2text_small.tgz; tar zxf /tmp/im2text.tgz -C /tmp/; head /tmp/im2text/src-train.txt > /tmp/im2text/src-train-head.txt; head /tmp/im2text/tgt-train.txt > /tmp/im2text/tgt-train-head.txt; head /tmp/im2text/src-val.txt > /tmp/im2text/src-val-head.txt; head /tmp/im2text/tgt-val.txt > /tmp/im2text/tgt-val-head.txt
- wget -O /tmp/speech.tgz http://lstm.seas.harvard.edu/latex/speech.tgz; tar zxf /tmp/speech.tgz -C /tmp/; head /tmp/speech/src-train.txt > /tmp/speech/src-train-head.txt; head /tmp/speech/tgt-train.txt > /tmp/speech/tgt-train-head.txt; head /tmp/speech/src-val.txt > /tmp/speech/src-val-head.txt; head /tmp/speech/tgt-val.txt > /tmp/speech/tgt-val-head.txt
- wget -O /tmp/test_model_speech.pt http://lstm.seas.harvard.edu/latex/test_model_speech.pt
- wget -O /tmp/test_model_speech.pt http://lstm.seas.harvard.edu/latex/model_step_2760.pt
- wget -O /tmp/test_model_im2text.pt http://lstm.seas.harvard.edu/latex/test_model_im2text.pt
- python -m unittest discover
# test nmt preprocessing
- python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data /tmp/data -src_vocab_size 1000 -tgt_vocab_size 1000 && rm -rf /tmp/data*.pt
# test im2text preprocessing
- python preprocess.py -data_type img -src_dir /tmp/im2text/images -train_src /tmp/im2text/src-train.txt -train_tgt /tmp/im2text/tgt-train.txt -valid_src /tmp/im2text/src-val.txt -valid_tgt /tmp/im2text/tgt-val.txt -save_data /tmp/im2text/data && rm -rf /tmp/im2text/data*.pt
- python preprocess.py -data_type img -shard_size 3 -src_dir /tmp/im2text/images -train_src /tmp/im2text/src-train.txt -train_tgt /tmp/im2text/tgt-train.txt -valid_src /tmp/im2text/src-val.txt -valid_tgt /tmp/im2text/tgt-val.txt -save_data /tmp/im2text/data && rm -rf /tmp/im2text/data*.pt
# test speech2text preprocessing
- python preprocess.py -data_type audio -src_dir /tmp/speech/an4_dataset -train_src /tmp/speech/src-train.txt -train_tgt /tmp/speech/tgt-train.txt -valid_src /tmp/speech/src-val.txt -valid_tgt /tmp/speech/tgt-val.txt -save_data /tmp/speech/data && rm -rf /tmp/speech/data*.pt
- python preprocess.py -data_type audio -shard_size 300 -src_dir /tmp/speech/an4_dataset -train_src /tmp/speech/src-train.txt -train_tgt /tmp/speech/tgt-train.txt -valid_src /tmp/speech/src-val.txt -valid_tgt /tmp/speech/tgt-val.txt -save_data /tmp/speech/data && rm -rf /tmp/speech/data*.pt
# test nmt translation
- head data/src-test.txt > /tmp/src-test.txt; python translate.py -model onmt/tests/test_model.pt -src /tmp/src-test.txt -verbose
# test im2text translation
@@ -50,7 +50,7 @@ script:
# test nmt preprocessing and training
- head data/src-val.txt > /tmp/src-val.txt; head data/tgt-val.txt > /tmp/tgt-val.txt; python preprocess.py -train_src /tmp/src-val.txt -train_tgt /tmp/tgt-val.txt -valid_src /tmp/src-val.txt -valid_tgt /tmp/tgt-val.txt -save_data /tmp/q -src_vocab_size 1000 -tgt_vocab_size 1000; python train.py -data /tmp/q -rnn_size 2 -batch_size 10 -word_vec_size 5 -report_every 5 -rnn_size 10 -train_steps 10 && rm -rf /tmp/q*.pt
# test nmt preprocessing w/ sharding and training w/copy
- head data/src-val.txt > /tmp/src-val.txt; head data/tgt-val.txt > /tmp/tgt-val.txt; python preprocess.py -train_src /tmp/src-val.txt -train_tgt /tmp/tgt-val.txt -valid_src /tmp/src-val.txt -valid_tgt /tmp/tgt-val.txt -max_shard_size 1 -dynamic_dict -save_data /tmp/q -src_vocab_size 1000 -tgt_vocab_size 1000; python train.py -data /tmp/q -rnn_size 2 -batch_size 10 -word_vec_size 5 -report_every 5 -rnn_size 10 -copy_attn -train_steps 10 && rm -rf /tmp/q*.pt
- head data/src-val.txt > /tmp/src-val.txt; head data/tgt-val.txt > /tmp/tgt-val.txt; python preprocess.py -train_src /tmp/src-val.txt -train_tgt /tmp/tgt-val.txt -valid_src /tmp/src-val.txt -valid_tgt /tmp/tgt-val.txt -shard_size 1 -dynamic_dict -save_data /tmp/q -src_vocab_size 1000 -tgt_vocab_size 1000; python train.py -data /tmp/q -rnn_size 2 -batch_size 10 -word_vec_size 5 -report_every 5 -rnn_size 10 -copy_attn -train_steps 10 && rm -rf /tmp/q*.pt

# test im2text preprocessing and training
- head /tmp/im2text/src-val.txt > /tmp/im2text/src-val-head.txt; head /tmp/im2text/tgt-val.txt > /tmp/im2text/tgt-val-head.txt; python preprocess.py -data_type img -src_dir /tmp/im2text/images -train_src /tmp/im2text/src-val-head.txt -train_tgt /tmp/im2text/tgt-val-head.txt -valid_src /tmp/im2text/src-val-head.txt -valid_tgt /tmp/im2text/tgt-val-head.txt -save_data /tmp/im2text/q; python train.py -model_type img -data /tmp/im2text/q -rnn_size 2 -batch_size 10 -word_vec_size 5 -report_every 5 -rnn_size 10 -train_steps 10 && rm -rf /tmp/im2text/q*.pt
119 changes: 119 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,119 @@

**Notes on versioning**


## [Unreleased]
### Fixes and improvements

## [0.8.2](https://github.com/OpenNMT/OpenNMT-py/tree/0.8.2) (2019-02-16)
* Update documentation and Library example
* Revamp args
* Bug fixes, save moving average in FP32
* Allow FP32 inference for FP16 models

## [0.8.1](https://github.com/OpenNMT/OpenNMT-py/tree/0.8.1) (2019-02-12)
* Update documentation
* Random sampling scores fixes
* Bug fixes

## [0.8.0](https://github.com/OpenNMT/OpenNMT-py/tree/0.8.0) (2019-02-09)
* Many fixes and code cleaning thanks @flauted, @guillaumekln
* Datasets code refactor (thanks @flauted); you need to re-preprocess datasets

### New features
* FP16 Support: Experimental, using Apex, Checkpoints may break in future version.
* Continuous exponential moving average (thanks @francoishernandez, and Marian)
* Relative positions encoding (thanks @francoishernanndez, and Google T2T)
* Deprecate the old beam search, fast batched beam search supports all options


## [0.7.2](https://github.com/OpenNMT/OpenNMT-py/tree/0.7.2) (2019-01-31)
* Many fixes and code cleaning thanks @bpopeters, @flauted, @guillaumekln

### New features
* Multilevel fields for better handling of text feature embeddings.


## [0.7.1](https://github.com/OpenNMT/OpenNMT-py/tree/0.7.1) (2019-01-24)
* Many fixes and code refactoring thanks @bpopeters, @flauted, @guillaumekln

### New features
* Random sampling thanks @daphnei
* Enable sharding for huge files at translation

## [0.7.0](https://github.com/OpenNMT/OpenNMT-py/tree/0.7.0) (2019-01-02)
* Many fixes and code refactoring thanks @benopeters
* Migrated to Pytorch 1.0

## [0.6.0](https://github.com/OpenNMT/OpenNMT-py/tree/0.6.0) (2018-11-28)
* Many fixes and code improvements
* New: Ability to load a yml config file. See examples in config folder.

## [0.5.0](https://github.com/OpenNMT/OpenNMT-py/tree/0.5.0) (2018-10-24)
* Fixed advance n_best beam in translate_batch_fast
* Fixed remove valid set vocab from total vocab
* New: Ability to reset optimizer when using train_from
* New: create_vocabulary tool + fix when loading existing vocab.

## [0.4.1](https://github.com/OpenNMT/OpenNMT-py/tree/0.4.1) (2018-10-11)
* Fixed preprocessing files names, cleaning intermediary files.

## [0.4.0](https://github.com/OpenNMT/OpenNMT-py/tree/0.4.0) (2018-10-08)
* Fixed Speech2Text training (thanks Yuntian)

* Removed -max_shard_size, replaced by -shard_size = number of examples in a shard.
Default value = 1M which works fine in most Text dataset cases. (will avoid Ram OOM in most cases)


## [0.3.0](https://github.com/OpenNMT/OpenNMT-py/tree/0.3.0) (2018-09-27)
* Now requires Pytorch 0.4.1

* Multi-node Multi-GPU with Torch Distributed

New options are:
-master_ip: ip address of the master node
-master_port: port number of the master node
-world_size = total number of processes to be run (total GPUs across all nodes)
-gpu_ranks = list of indices of processes across all nodes

* gpuid is deprecated
See examples in https://github.com/OpenNMT/OpenNMT-py/blob/master/docs/source/FAQ.md

* Fixes to img2text now working

* New sharding based on number of examples

* Fixes to avoid 0.4.1 deprecated functions.


## [0.2.1](https://github.com/OpenNMT/OpenNMT-py/tree/0.2.1) (2018-08-31)

### Fixes and improvements

* First compatibility steps with Pytorch 0.4.1 (non breaking)
* Fix TranslationServer (when various request try to load the same model at the same time)
* Fix StopIteration error (python 3.7)

### New features
* Ensemble at inference (thanks @Waino)

## [0.2](https://github.com/OpenNMT/OpenNMT-py/tree/v0.2) (2018-08-28)

### improvements

* Compatibility fixes with Pytorch 0.4 / Torchtext 0.3
* Multi-GPU based on Torch Distributed
* Average Attention Network (AAN) for the Transformer (thanks @francoishernandez )
* New fast beam search (see -fast in translate.py) (thanks @guillaumekln)
* Sparse attention / sparsemax (thanks to @bpopeters)
* Refactoring of many parts of the code base:
- change from -epoch to -train_steps -valid_steps (see opts.py)
- reorg of the logic train => train_multi / train_single => trainer
* Many fixes / improvements in the translationserver (thanks @pltrdy @francoishernandez)
* fix BPTT

## [0.1](https://github.com/OpenNMT/OpenNMT-py/tree/v0.1) (2018-06-08)

### First and Last Release using Pytorch 0.3.x


87 changes: 82 additions & 5 deletions CONTRIBUTING.md
@@ -1,11 +1,88 @@
# Contributors

OpenNMT-py is a community developed project and we love developer contributions.

## Guidelines
Before sending a PR, please do this checklist first:

- Please run `test/pull_request_chk.sh` and fix any errors. When adding new functionality, also add tests to this script. Included checks:
1. flake8 and pep8-naming check for coding style;
- Please run `tools/pull_request_chk.sh` and fix any errors. When adding new functionality, also add tests to this script. Included checks:
1. flake8 check for coding style;
2. unittest;
3. continuous integration tests listed in `.travis.yml`.
- When adding/modifying class constructor, please make the arguments as same naming style as its superclass in pytorch.
- If your change is based on a paper, please include a clear comment and reference in the code.
- If your function takes/returns tensor arguments, please include assertions to document the sizes. See `GlobalAttention.py` for examples.
- When adding or modifying a class constructor, please use the same argument naming style as its superclass in PyTorch.
- If your change is based on a paper, please include a clear comment and reference in the code (more on that below).

### Docstrings
Above all, try to follow the Google docstring format
([Napoleon example](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html),
[Google styleguide](http://google.github.io/styleguide/pyguide.html)).
This makes it easy to include your contributions in the Sphinx documentation. And, do feel free
to autodoc your contributions in the API ``.rst`` files in the `docs/source` folder! If you do, check that
your additions look right.

```bash
cd docs
# install some dependencies if necessary:
# recommonmark, sphinx_rtd_theme, sphinxcontrib-bibtex
make html
firefox build/html/main.html # or your browser of choice
```

Some particular advice:
- Try to follow Python 3 [``typing`` module](https://docs.python.org/3/library/typing.html) conventions when documenting types.
- Exception: use "or" instead of unions for more readability
- For external types, use the full "import name". Common abbreviations (e.g. ``np``) are acceptable.
For ``torch.Tensor`` types, the ``torch.`` is optional.
- Please don't use tics like `` (`str`) `` or rst directives like `` (:obj:`str`) ``. Napoleon handles types
very well without additional help, so avoid the clutter.
- [Google docstrings don't support multiple returns](https://stackoverflow.com/questions/29221551/can-sphinx-napoleon-document-function-returning-multiple-arguments).
For multiple returns, the following works well with Sphinx and is still very readable.
```python
def foo(a, b):
"""This is my docstring.
Args:
a (object): Something.
b (class): Another thing.
Returns:
(object, class):
* a: Something or rather with a long
description that spills over.
* b: And another thing.
"""

return a, b
```
- When citing a paper, avoid directly linking in the docstring! Add a Bibtex entry to `docs/source/refs.bib`.
E.g., to cite "Attention Is All You Need", visit [arXiv](https://arxiv.org/abs/1706.03762), choose the
[bibtext](https://dblp.uni-trier.de/rec/bibtex/journals/corr/VaswaniSPUJGKP17) link, search `docs/source/refs.bib`
using `CTRL-F` for `DBLP:journals/corr/VaswaniSPUJGKP17`, and if you do not find it then copy-paste the
citation into `refs.bib`. Then, in your docstring, use ``:cite:`DBLP:journals/corr/VaswaniSPUJGKP17` ``.
- However, a link is better than nothing.
- Please document tensor shapes. Prefer the format
``` ``(a, b, c)`` ```. This style is easy to read, allows using ``x`` for multiplication, and is common
(PyTorch uses a few variations on the parentheses format, AllenNLP uses exactly this format, Fairseq uses
the parentheses format with single ticks).
- Again, a different style is better than no shape documentation.
- Please avoid unnecessary space characters, try to capitalize, and try to punctuate.

For multi-line docstrings, add a blank line after the closing ``"""``.
Don't use a blank line before the closing quotes.

``""" not this """`` ``"""This."""``

```python
"""
Not this.
"""
```
```python
"""This."""
```

This note is the least important. Focus on content first, but remember that consistent docs look good.
- Be sensible about the first line. Generally, one stand-alone summary line (per the Google guidelines) is good.
Sometimes, it's better to cut directly to the args or an extended description. It's always acceptable to have a
"trailing" citation.
53 changes: 38 additions & 15 deletions README.md
@@ -5,15 +5,16 @@

This is a [Pytorch](https://github.com/pytorch/pytorch)
port of [OpenNMT](https://github.com/OpenNMT/OpenNMT),
an open-source (MIT) neural machine translation system. It is designed to be research friendly to try out new ideas in translation, summary, image-to-text, morphology, and many other domains.
an open-source (MIT) neural machine translation system. It is designed to be research friendly to try out new ideas in translation, summary, image-to-text, morphology, and many other domains. Some companies have proven the code to be production ready.

Codebase is relatively stable, but PyTorch is still evolving. We currently only support PyTorch 0.4 and recommend forking if you need to have stable code.

OpenNMT-py is run as a collaborative open-source project. It is maintained by [Sasha Rush](http://github.com/srush) (Cambridge, MA), [Ben Peters](http://github.com/bpopeters) (Lisbon), and [Jianyu Zhan](http://github.com/jianyuzhan) (Shanghai). The original code was written by [Adam Lerer](http://github.com/adamlerer) (NYC).
We love contributions. Please consult the Issues page for any [Contributions Welcome](https://github.com/OpenNMT/OpenNMT-py/issues?q=is%3Aissue+is%3Aopen+label%3A%22contributions+welcome%22) tagged post.

<center style="padding: 40px"><img width="70%" src="http://opennmt.github.io/simple-attn.png" /></center>

Before raising an issue, make sure you read the requirements and the documentation examples.

Unless there is a bug, please use the [Forum](http://forum.opennmt.net) or [Gitter](https://gitter.im/OpenNMT/OpenNMT-py) to ask questions.


Table of Contents
=================
@@ -22,6 +23,7 @@ Table of Contents
* [Features](#features)
* [Quickstart](#quickstart)
* [Run on FloydHub](#run-on-floydhub)
* [Acknowledgements](#acknowledgements)
* [Citation](#citation)

## Requirements
@@ -32,29 +34,27 @@ All dependencies can be installed via:
pip install -r requirements.txt
```

Note that we currently only support PyTorch 0.4.
Note that we currently only support PyTorch 1.0.0

## Features

The following OpenNMT features are implemented:

- [data preprocessing](http://opennmt.net/OpenNMT-py/options/preprocess.html)
- [Inference (translation) with batching and beam search](http://opennmt.net/OpenNMT-py/options/translate.html)
- [Multiple source and target RNN (lstm/gru) types and attention (dotprod/mlp) types](http://opennmt.net/OpenNMT-py/options/train.html#model-encoder-decoder)
- [TensorBoard/Crayon logging](http://opennmt.net/OpenNMT-py/options/train.html#logging)
- [TensorBoard](http://opennmt.net/OpenNMT-py/options/train.html#logging)
- [Source word features](http://opennmt.net/OpenNMT-py/options/train.html#model-embeddings)
- [Pretrained Embeddings](http://opennmt.net/OpenNMT-py/FAQ.html#how-do-i-use-pretrained-embeddings-e-g-glove)
- [Copy and Coverage Attention](http://opennmt.net/OpenNMT-py/options/train.html#model-attention)
- [Image-to-text processing](http://opennmt.net/OpenNMT-py/im2text.html)
- [Speech-to-text processing](http://opennmt.net/OpenNMT-py/speech2text.html)
- ["Attention is all you need"](http://opennmt.net/OpenNMT-py/FAQ.html#how-do-i-use-the-transformer-model)
- [Multi-GPU](http://opennmt.net/OpenNMT-py/FAQ.html##do-you-support-multi-gpu)
- Inference time loss functions.

Beta Features (committed):
- multi-GPU
- Structured attention
- [Conv2Conv convolution model]
- SRU "RNNs faster than CNN" paper
- FP16 training (mixed-precision with Apex)
- Protection of terms.
- Web service for large texts.

## Quickstart

@@ -96,8 +96,11 @@ python train.py -data data/demo -save_model demo-model

The main train command is quite simple. Minimally it takes a data file
and a save file. This will run the default model, which consists of a
2-layer LSTM with 500 hidden units on both the encoder/decoder. You
can also add `-gpuid 1` to use (say) GPU 1.
2-layer LSTM with 500 hidden units on both the encoder/decoder.
If you want to train on a GPU, you need to set, for example,
CUDA_VISIBLE_DEVICES=1,3 and
`-world_size 2 -gpu_ranks 0 1` to use (say) GPUs 1 and 3 on this node only.
To learn more about distributed training on single or multiple nodes, read the FAQ section.
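
As a minimal sketch of that example (reusing the demo data and model names from this step; adjust the device IDs to your machine):

```bash
# Make GPUs 1 and 3 visible, then train with two ranks on this node
export CUDA_VISIBLE_DEVICES=1,3
python train.py -data data/demo -save_model demo-model -world_size 2 -gpu_ranks 0 1
```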

### Step 3: Translate

@@ -119,18 +122,38 @@ Click this button to open a Workspace on [FloydHub](https://www.floydhub.com/?ut

## Pretrained embeddings (e.g. GloVe)

Go to tutorial: [How to use GloVe pre-trained embeddings in OpenNMT-py](http://forum.opennmt.net/t/how-to-use-glove-pre-trained-embeddings-in-opennmt-py/1011)
Please see the FAQ: [How to use GloVe pre-trained embeddings in OpenNMT-py](http://opennmt.net/OpenNMT-py/FAQ.html#how-do-i-use-pretrained-embeddings-e-g-glove)

## Pretrained Models

The following pretrained models can be downloaded and used with translate.py.

http://opennmt.net/Models-py/
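
As a minimal sketch (the checkpoint filename below is a placeholder; use whichever model you downloaded from the page above):

```bash
# Translate a test file with a downloaded checkpoint (hypothetical filename)
python translate.py -model downloaded-model.pt -src data/src-test.txt -output pred.txt -replace_unk -verbose
```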

## Acknowledgements

OpenNMT-py is run as a collaborative open-source project.
The original code was written by [Adam Lerer](http://github.com/adamlerer) (NYC) to reproduce OpenNMT-Lua using Pytorch.

Major contributors are:
[Sasha Rush](https://github.com/srush) (Cambridge, MA)
[Vincent Nguyen](https://github.com/vince62s) (Ubiqus)
[Ben Peters](http://github.com/bpopeters) (Lisbon)
[Sebastian Gehrmann](https://github.com/sebastianGehrmann) (Harvard NLP)
[Yuntian Deng](https://github.com/da03) (Harvard NLP)
[Guillaume Klein](https://github.com/guillaumekln) (Systran)
[Paul Tardy](https://github.com/pltrdy) (Ubiqus / Lium)
[François Hernandez](https://github.com/francoishernandez) (Ubiqus)
[Jianyu Zhan](http://github.com/jianyuzhan) (Shanghai)
[Dylan Flaute](http://github.com/flauted) (University of Dayton)
and more!

OpenNMT-py belongs to the OpenNMT project along with OpenNMT-Lua and OpenNMT-tf.

## Citation

[OpenNMT: Neural Machine Translation Toolkit](https://arxiv.org/pdf/1805.11462)

[OpenNMT technical report](https://doi.org/10.18653/v1/P17-4012)

```
31 changes: 31 additions & 0 deletions config/config-rnn-summarization.yml
@@ -0,0 +1,31 @@
data: data/cnndm/CNNDM
save_model: models/cnndm
save_checkpoint_steps: 10000
keep_checkpoint: 10
seed: 3435
train_steps: 100000
valid_steps: 10000
report_every: 100

encoder_type: brnn
word_vec_size: 128
rnn_size: 512
layers: 1

optim: adagrad
learning_rate: 0.15
adagrad_accumulator_init: 0.1
max_grad_norm: 2

batch_size: 16
dropout: 0.0

copy_attn: 'true'
global_attention: mlp
reuse_copy_attn: 'true'
bridge: 'true'

world_size: 2
gpu_ranks:
- 0
- 1
42 changes: 42 additions & 0 deletions config/config-transformer-base-1GPU.yml
@@ -0,0 +1,42 @@
data: exp/dataset.de-en
save_model: exp/model.de-en
save_checkpoint_steps: 10000
keep_checkpoint: 10
seed: 3435
train_steps: 500000
valid_steps: 10000
warmup_steps: 8000
report_every: 100

decoder_type: transformer
encoder_type: transformer
word_vec_size: 512
rnn_size: 512
layers: 6
transformer_ff: 2048
heads: 8

accum_count: 8
optim: adam
adam_beta1: 0.9
adam_beta2: 0.998
decay_method: noam
learning_rate: 2.0
max_grad_norm: 0.0

batch_size: 4096
batch_type: tokens
normalization: tokens
dropout: 0.1
label_smoothing: 0.1

max_generator_batches: 2

param_init: 0.0
param_init_glorot: 'true'
position_encoding: 'true'

world_size: 1
gpu_ranks:
- 0

45 changes: 45 additions & 0 deletions config/config-transformer-base-4GPU.yml
@@ -0,0 +1,45 @@
data: exp/dataset.de-en
save_model: exp/model.de-en
save_checkpoint_steps: 10000
keep_checkpoint: 10
seed: 3435
train_steps: 200000
valid_steps: 10000
warmup_steps: 8000
report_every: 100

decoder_type: transformer
encoder_type: transformer
word_vec_size: 512
rnn_size: 512
layers: 6
transformer_ff: 2048
heads: 8

accum_count: 2
optim: adam
adam_beta1: 0.9
adam_beta2: 0.998
decay_method: noam
learning_rate: 2.0
max_grad_norm: 0.0

batch_size: 4096
batch_type: tokens
normalization: tokens
dropout: 0.1
label_smoothing: 0.1

max_generator_batches: 2

param_init: 0.0
param_init_glorot: 'true'
position_encoding: 'true'

world_size: 4
gpu_ranks:
- 0
- 1
- 2
- 3

2 changes: 1 addition & 1 deletion data/README.md
@@ -4,4 +4,4 @@

> python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/data -src_vocab_size 1000 -tgt_vocab_size 1000
> python train.py -data data/data -save_model /n/rush_lab/data/tmp_ -gpuid 0 -rnn_size 100 -word_vec_size 50 -layers 1 -train_steps 100 -optim adam -learning_rate 0.001
> python train.py -data data/data -save_model /n/rush_lab/data/tmp_ -world_size 1 -gpu_ranks 0 -rnn_size 100 -word_vec_size 50 -layers 1 -train_steps 100 -optim adam -learning_rate 0.001
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -3,3 +3,4 @@ sphinxcontrib.bibtex
sphinxcontrib.mermaid
sphinx-rtd-theme
recommonmark
sphinx-argparse
81 changes: 78 additions & 3 deletions docs/source/CONTRIBUTING.md
@@ -2,12 +2,87 @@

OpenNMT-py is a community developed project and we love developer contributions.

## Guidelines
Before sending a PR, please do this checklist first:

- Please run `tools/pull_request_chk.sh` and fix any errors. When adding new functionality, also add tests to this script. Included checks:
1. flake8 check for coding style;
2. unittest;
3. continuous integration tests listed in `.travis.yml`.
- When adding/modifying class constructor, please make the arguments as same naming style as its superclass in pytorch.
- If your change is based on a paper, please include a clear comment and reference in the code.
- If your function takes/returns tensor arguments, please include assertions to document the sizes. See `GlobalAttention.py` for examples.
- When adding or modifying a class constructor, please use the same argument naming style as its superclass in PyTorch.
- If your change is based on a paper, please include a clear comment and reference in the code (more on that below).

### Docstrings
Above all, try to follow the Google docstring format
([Napoleon example](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html),
[Google styleguide](http://google.github.io/styleguide/pyguide.html)).
This makes it easy to include your contributions in the Sphinx documentation. And, do feel free
to autodoc your contributions in the API ``.rst`` files in the `docs/source` folder! If you do, check that
your additions look right.

```bash
cd docs
# install some dependencies if necessary:
# recommonmark, sphinx_rtd_theme, sphinxcontrib-bibtex
make html
firefox build/html/main.html # or your browser of choice
```

Some particular advice:
- Try to follow Python 3 [``typing`` module](https://docs.python.org/3/library/typing.html) conventions when documenting types.
- Exception: use "or" instead of unions for more readability
- For external types, use the full "import name". Common abbreviations (e.g. ``np``) are acceptable.
For ``torch.Tensor`` types, the ``torch.`` is optional.
- Please don't use tics like `` (`str`) `` or rst directives like `` (:obj:`str`) ``. Napoleon handles types
very well without additional help, so avoid the clutter.
- [Google docstrings don't support multiple returns](https://stackoverflow.com/questions/29221551/can-sphinx-napoleon-document-function-returning-multiple-arguments).
For multiple returns, the following works well with Sphinx and is still very readable.
```python
def foo(a, b):
"""This is my docstring.
Args:
a (object): Something.
b (class): Another thing.
Returns:
(object, class):
* a: Something or rather with a long
description that spills over.
* b: And another thing.
"""

return a, b
```
- When citing a paper, avoid directly linking in the docstring! Add a Bibtex entry to `docs/source/refs.bib`.
E.g., to cite "Attention Is All You Need", visit [arXiv](https://arxiv.org/abs/1706.03762), choose the
[bibtext](https://dblp.uni-trier.de/rec/bibtex/journals/corr/VaswaniSPUJGKP17) link, search `docs/source/refs.bib`
using `CTRL-F` for `DBLP:journals/corr/VaswaniSPUJGKP17`, and if you do not find it then copy-paste the
citation into `refs.bib`. Then, in your docstring, use ``:cite:`DBLP:journals/corr/VaswaniSPUJGKP17` ``.
- However, a link is better than nothing.
- Please document tensor shapes. Prefer the format
``` ``(a, b, c)`` ```. This style is easy to read, allows using ``x`` for multiplication, and is common
(PyTorch uses a few variations on the parentheses format, AllenNLP uses exactly this format, Fairseq uses
the parentheses format with single ticks).
- Again, a different style is better than no shape documentation.
- Please avoid unnecessary space characters, try to capitalize, and try to punctuate.

For multi-line docstrings, add a blank line after the closing ``"""``.
Don't use a blank line before the closing quotes.

``""" not this """`` ``"""This."""``

```python
"""
Not this.
"""
```
```python
"""This."""
```

This note is the least important. Focus on content first, but remember that consistent docs look good.
- Be sensible about the first line. Generally, one stand-alone summary line (per the Google guidelines) is good.
Sometimes, it's better to cut directly to the args or an extended description. It's always acceptable to have a
"trailing" citation.
45 changes: 29 additions & 16 deletions docs/source/FAQ.md
@@ -9,14 +9,14 @@ the script is a slightly modified version of ylhsieh’s one2.
Usage:

```
embeddings_to_torch.py [-h] -emb_file EMB_FILE -output_file OUTPUT_FILE -dict_file DICT_FILE [-verbose]
emb_file: GloVe like embedding file i.e. CSV [word] [dim1] ... [dim_d]
output_file: a filename to save the output as PyTorch serialized tensors2
dict_file: dict output from OpenNMT-py preprocessing
embeddings_to_torch.py [-h] [-emb_file_both EMB_FILE_BOTH]
[-emb_file_enc EMB_FILE_ENC]
[-emb_file_dec EMB_FILE_DEC] -output_file
OUTPUT_FILE -dict_file DICT_FILE [-verbose]
[-skip_lines SKIP_LINES]
[-type {GloVe,word2vec}]
```
Run embeddings_to_torch.py -h for complete usage info.

Example

@@ -43,7 +43,7 @@ python preprocess.py \
3) prepare embeddings:

```
./tools/embeddings_to_torch.py -emb_file "glove_dir/glove.6B.100d.txt" \
./tools/embeddings_to_torch.py -emb_file_both "glove_dir/glove.6B.100d.txt" \
-dict_file "data/data.vocab.pt" \
-output_file "data/embeddings"
```
@@ -62,21 +62,22 @@ python train.py -save_model data/model \
```


## How do I use the Transformer model?
## How do I use the Transformer model? Do you support multi-gpu?

The transformer model is very sensitive to hyperparameters. To run it
effectively you need to set a bunch of different options that mimic the Google
setup. We have confirmed the following command can replicate their WMT results.

```
python train.py -data /tmp/de2/data -save_model /tmp/extra -gpuid 1 \
-layers 6 -rnn_size 512 -word_vec_size 512 \
python train.py -data /tmp/de2/data -save_model /tmp/extra \
-layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8 \
-encoder_type transformer -decoder_type transformer -position_encoding \
-train_steps 100000 -max_generator_batches 32 -dropout 0.1 \
-batch_size 4096 -batch_type tokens -normalization tokens -accum_count 4 \
-train_steps 200000 -max_generator_batches 2 -dropout 0.1 \
-batch_size 4096 -batch_type tokens -normalization tokens -accum_count 2 \
-optim adam -adam_beta2 0.998 -decay_method noam -warmup_steps 8000 -learning_rate 2 \
-max_grad_norm 0 -param_init 0 -param_init_glorot \
-label_smoothing 0.1
-label_smoothing 0.1 -valid_steps 10000 -save_checkpoint_steps 10000 \
-world_size 4 -gpu_ranks 0 1 2 3
```

Here are what each of the parameters mean:
@@ -87,9 +88,21 @@ Here are what each of the parameters mean:
* `batch_type tokens`, `normalization tokens`, `accum_count 4`: batch and normalize based on number of tokens and not sentences. Compute gradients based on four batches.
- `label_smoothing 0.1`: use label smoothing loss.

Multi GPU settings
First you need to make sure you export CUDA_VISIBLE_DEVICES=0,1,2,3
If you want to use GPU id 1 and 3 of your OS, you will need to export CUDA_VISIBLE_DEVICES=1,3
* `world_size 4 gpu_ranks 0 1 2 3`: This will use 4 GPU on this node only.

If you want to use 2 nodes with 2 GPUs each, you need to set -master_ip and -master_port, and
* `world_size 4 gpu_ranks 0 1`: on the first node
* `world_size 4 gpu_ranks 2 3`: on the second node
* `accum_count 2`: This will accumulate over 2 batches before updating parameters.

If you use a regular network card (1 Gbps), we suggest using a higher accum_count to minimize inter-node communication.
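
As a minimal sketch of the two-node, two-GPUs-per-node setup described above (the IP address and port are placeholder values, and the data/model paths reuse the example command earlier in this section; other training flags omitted):

```bash
# On the first node (hosts the master process)
export CUDA_VISIBLE_DEVICES=0,1
python train.py -data /tmp/de2/data -save_model /tmp/extra \
    -world_size 4 -gpu_ranks 0 1 \
    -master_ip 10.0.0.1 -master_port 10000

# On the second node
export CUDA_VISIBLE_DEVICES=0,1
python train.py -data /tmp/de2/data -save_model /tmp/extra \
    -world_size 4 -gpu_ranks 2 3 \
    -master_ip 10.0.0.1 -master_port 10000
```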

## Do you support multi-gpu?
## How can I ensemble Models at inference?

Currently our system does not support multi-gpu. It will be coming soon.
You can specify several models in the translate.py command line: -model model1_seed1 model2_seed2
Bear in mind that your models must share the same target vocabulary.
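
For instance, a minimal sketch (the model names follow the placeholders above; the source and output paths are illustrative):

```bash
# Ensemble two checkpoints at inference time
python translate.py -model model1_seed1 model2_seed2 \
    -src data/src-test.txt -output pred_ensemble.txt -gpu 0
```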


31 changes: 16 additions & 15 deletions docs/source/Library.ipynb
@@ -12,8 +12,9 @@
"import torch.nn as nn\n",
"\n",
"import onmt\n",
"import onmt.io\n",
"import onmt.modules"
"import onmt.inputters\n",
"import onmt.modules\n",
"import onmt.utils"
]
},
{
@@ -30,8 +31,8 @@
"outputs": [],
"source": [
"vocab = dict(torch.load(\"../../data/data.vocab.pt\"))\n",
"src_padding = vocab[\"src\"].stoi[onmt.io.PAD_WORD]\n",
"tgt_padding = vocab[\"tgt\"].stoi[onmt.io.PAD_WORD]"
"src_padding = vocab[\"src\"].stoi[onmt.inputters.PAD_WORD]\n",
"tgt_padding = vocab[\"tgt\"].stoi[onmt.inputters.PAD_WORD]"
]
},
{
@@ -53,22 +54,22 @@
"encoder_embeddings = onmt.modules.Embeddings(emb_size, len(vocab[\"src\"]),\n",
" word_padding_idx=src_padding)\n",
"\n",
"encoder = onmt.modules.RNNEncoder(hidden_size=rnn_size, num_layers=1, \n",
"encoder = onmt.encoders.RNNEncoder(hidden_size=rnn_size, num_layers=1, \n",
" rnn_type=\"LSTM\", bidirectional=True,\n",
" embeddings=encoder_embeddings)\n",
"\n",
"decoder_embeddings = onmt.modules.Embeddings(emb_size, len(vocab[\"tgt\"]),\n",
" word_padding_idx=tgt_padding)\n",
"decoder = onmt.modules.InputFeedRNNDecoder(hidden_size=rnn_size, num_layers=1, \n",
"decoder = onmt.decoders.decoder.InputFeedRNNDecoder(hidden_size=rnn_size, num_layers=1, \n",
" bidirectional_encoder=True,\n",
" rnn_type=\"LSTM\", embeddings=decoder_embeddings)\n",
"model = onmt.modules.NMTModel(encoder, decoder)\n",
"model = onmt.models.model.NMTModel(encoder, decoder)\n",
"\n",
"# Specify the tgt word generator and loss computation module\n",
"model.generator = nn.Sequential( \n",
" nn.Linear(rnn_size, len(vocab[\"tgt\"])), \n",
" nn.LogSoftmax())\n",
"loss = onmt.Loss.NMTLossCompute(model.generator, vocab[\"tgt\"]) "
"loss = onmt.utils.loss.NMTLossCompute(model.generator, vocab[\"tgt\"]) "
]
},
{
@@ -84,8 +85,8 @@
"metadata": {},
"outputs": [],
"source": [
"optim = onmt.Optim(method=\"sgd\", lr=1, max_grad_norm=2)\n",
"optim.set_parameters(model.parameters())"
"optim = onmt.utils.optimizers.Optimizer(method=\"sgd\", lr=1, max_grad_norm=2)\n",
"optim.set_parameters(model.named_parameters())"
]
},
{
@@ -102,8 +103,8 @@
"outputs": [],
"source": [
"# Load some data\n",
"data = torch.load(\"../../data/data.train.pt\")\n",
"valid_data = torch.load(\"../../data/data.valid.pt\")\n",
"data = torch.load(\"../../data/data.train.1.pt\")\n",
"valid_data = torch.load(\"../../data/data.valid.1.pt\")\n",
"data.load_fields(vocab)\n",
"valid_data.load_fields(vocab)\n",
"data.examples = data.examples[:100] "
@@ -122,11 +123,11 @@
"metadata": {},
"outputs": [],
"source": [
"train_iter = onmt.io.OrderedIterator( \n",
"train_iter = onmt.inputters.OrderedIterator( \n",
" dataset=data, batch_size=10, \n",
" device=-1, \n",
" repeat=False)\n",
"valid_iter = onmt.io.OrderedIterator( \n",
"valid_iter = onmt.inputters.OrderedIterator( \n",
" dataset=valid_data, batch_size=10, \n",
" device=-1,\n",
" train=False) "
@@ -176,7 +177,7 @@
}
],
"source": [
"trainer = onmt.Trainer(model, train_iter, valid_iter, loss, loss, optim)\n",
"trainer = onmt.Trainer(model, loss, loss, optim)\n",
"\n",
"def report_func(*args):\n",
" stats = args[-1]\n",
266 changes: 142 additions & 124 deletions docs/source/Library.md

Large diffs are not rendered by default.

69 changes: 43 additions & 26 deletions docs/source/Summarization.md
@@ -1,6 +1,19 @@
# Example: Summarization
# Summarization

This document describes how to replicate summarization experiments on the CNNDM and gigaword datasets using OpenNMT-py.
Note: The process and results below are presented in our paper `Bottom-Up Abstractive Summarization`. Please consider citing it if you follow these instructions.

```
@inproceedings{gehrmann2018bottom,
title={Bottom-Up Abstractive Summarization},
author={Gehrmann, Sebastian and Deng, Yuntian and Rush, Alexander},
booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
pages={4098--4109},
year={2018}
}
```


This document describes how to replicate summarization experiments on the CNN-DM and gigaword datasets using OpenNMT-py.
In the following, we assume access to a tokenized form of the corpus split into train/valid/test set. You can find the data [here](https://github.com/harvardnlp/sent-summary).

An example article-title pair from Gigaword should look like this:
@@ -16,25 +29,25 @@ An example article-title pair from Gigaword should look like this:

Since we are using copy-attention [1] in the model, we need to preprocess the dataset such that source and target are aligned and use the same dictionary. This is achieved by using the options `dynamic_dict` and `share_vocab`.
We additionally turn off truncation of the source to ensure that inputs longer than 50 words are not truncated.
For CNNDM we follow See et al. [2] and additionally truncate the source length at 400 tokens and the target at 100. We also note that in CNNDM, we found models to work better if the target surrounds sentences with tags such that a sentence looks like `<t> w1 w2 w3 . </t>`. If you use this formatting, you can remove the tags after the inference step with the commands `sed -i 's/ <\/t>//g' FILE.txt` and `sed -i 's/<t> //g' FILE.txt`.
For CNN-DM we follow See et al. [2] and additionally truncate the source length at 400 tokens and the target at 100. We also note that in CNN-DM, we found models to work better if the target surrounds sentences with tags such that a sentence looks like `<t> w1 w2 w3 . </t>`. If you use this formatting, you can remove the tags after the inference step with the commands `sed -i 's/ <\/t>//g' FILE.txt` and `sed -i 's/<t> //g' FILE.txt`.

**Command used**:

(1) CNNDM
(1) CNN-DM

```
python preprocess.py -train_src data/cnndm/train.txt.src \
-train_tgt data/cnndm/train.txt.tgt \
-train_tgt data/cnndm/train.txt.tgt.tagged \
-valid_src data/cnndm/val.txt.src \
-valid_tgt data/cnndm/val.txt.tgt \
-valid_tgt data/cnndm/val.txt.tgt.tagged \
-save_data data/cnndm/CNNDM \
-src_seq_length 10000 \
-tgt_seq_length 10000 \
-src_seq_length_trunc 400 \
-tgt_seq_length_trunc 100 \
-dynamic_dict \
-share_vocab \
-max_shard_size (500 * 1024 * 1024)
-shard_size 100000
```

(2) Gigaword
@@ -48,7 +61,7 @@ python preprocess.py -train_src data/giga/train.article.txt \
-src_seq_length 10000 \
-dynamic_dict \
-share_vocab \
-max_shard_size (500 * 1024 * 1024)
-shard_size 100000
```


@@ -67,12 +80,12 @@ The training procedure described in this section for the most part follows param


We are using using a 128-dimensional word-embedding, and 512-dimensional 1 layer LSTM. On the encoder side, we use a bidirectional LSTM (`brnn`), which means that the 512 dimensions are split into 256 dimensions per direction.
We also use OpenNMT's default learning rate decay, which halves the learning rate after every epoch once the validation perplexity increased after an epoch (or after epoch 8).

We additionally set the maximum norm of the gradient to 2, and renormalize if the gradient norm exceeds this value and do not use any dropout.

**commands used**:

(1) CNNDM
(1) CNN-DM

```
python train.py -save_model models/cnndm \
@@ -83,23 +96,25 @@ python train.py -save_model models/cnndm \
-rnn_size 512 \
-layers 1 \
-encoder_type brnn \
-epochs 20 \
-train_steps 200000 \
-max_grad_norm 2 \
-dropout 0. \
-batch_size 16 \
-valid_batch_size 16 \
-optim adagrad \
-learning_rate 0.15 \
-adagrad_accumulator_init 0.1 \
-reuse_copy_attn \
-copy_loss_by_seqlength \
-bridge \
-seed 777 \
-gpuid X
-world_size 2 \
-gpu_ranks 0 1
```

(2) CNNDM Transformer
(2) CNN-DM Transformer

The following script trains the transformer model on CNNDM
The following script trains the transformer model on CNN-DM

```
python -u train.py -data data/cnndm/CNNDM \
@@ -123,13 +138,13 @@ python -u train.py -data data/cnndm/CNNDM \
-batch_type tokens \
-normalization tokens \
-max_generator_batches 2 \
-epochs 25 \
-start_checkpoint_at 8 \
-train_steps 200000 \
-accum_count 4 \
-share_embeddings \
-copy_attn \
-param_init_glorot \
-gpuid 3
-world_size 2 \
-gpu_ranks 0 1
```

(3) Gigaword
@@ -141,13 +156,13 @@ python train.py -data data/giga/GIGA \
-save_model models/giga \
-copy_attn \
-reuse_copy_attn \
-epochs 20
-train_steps 200000
```


### Inference

During inference, we use beam-search with a beam-size of 5. We also added specific penalties that we can use during decoding, described in the following.
During inference, we use beam-search with a beam-size of 10. We also added specific penalties that we can use during decoding, described in the following.

- `stepwise_penalty`: Applies penalty at every step
- `coverage_penalty summary`: Uses a penalty that prevents repeated attention to the same source word
@@ -159,12 +174,12 @@ During inference, we use beam-search with a beam-size of 5. We also added specif

**commands used**:

(1) CNNDM
(1) CNN-DM

```
python translate.py -gpu X \
-batch_size 20 \
-beam_size 5 \
-beam_size 10 \
-model models/cnndm... \
-src data/cnndm/test.txt.src \
-output testout/cnndm.out \
@@ -184,18 +199,18 @@ python translate.py -gpu X \

### Evaluation

#### CNNDM
#### CNN-DM

To evaluate the ROUGE scores on CNNDM, we extended the pyrouge wrapper with additional evaluations such as the amount of repeated n-grams (typically found in models with copy attention), found [here](https://github.com/sebastianGehrmann/rouge-baselines). The repository includes a sub-repo called pyrouge. Make sure to clone the code with the `git clone --recurse-submodules https://github.com/sebastianGehrmann/rouge-baselines` command to check this out as well and follow the installation instructions on the pyrouge repository before calling this script.
To evaluate the ROUGE scores on CNN-DM, we extended the pyrouge wrapper with additional evaluations such as the amount of repeated n-grams (typically found in models with copy attention), found [here](https://github.com/sebastianGehrmann/rouge-baselines). The repository includes a sub-repo called pyrouge. Make sure to clone the code with the `git clone --recurse-submodules https://github.com/sebastianGehrmann/rouge-baselines` command to check this out as well and follow the installation instructions on the pyrouge repository before calling this script.
The installation instructions can be found [here](https://github.com/falcondai/pyrouge/tree/9cdbfbda8b8d96e7c2646ffd048743ddcf417ed9#installation). Note that on MacOS, we found that the pointer to your perl installation in line 1 of `pyrouge/RELEASE-1.5.5/ROUGE-1.5.5.pl` might be different from the one you have installed. A simple fix is to change this line to `#!/usr/local/bin/perl -w` if it fails.

It can be run with the following command:

```
python baseline.py -s testout/cnndm.out -t data/cnndm/test.txt.tgt -m no_sent_tag -r
python baseline.py -s testout/cnndm.out -t data/cnndm/test.txt.tgt.tagged -m sent_tag_verbatim -r
```

The `no_sent_tag` option strips tags around sentences - when a sentence previously was `<s> w w w w . </s>`, it becomes `w w w w .`.
The `sent_tag_verbatim` option strips `<t>` and `</t>` tags around sentences - when a sentence previously was `<t> w w w w . </t>`, it becomes `w w w w .`.

#### Gigaword

@@ -206,7 +221,9 @@ For evaluation of large test sets such as Gigaword, we use a parallel python

### Scores and Models

#### CNNDM
The website generator has trouble rendering tables; if you can't read the results, please go [here](https://github.com/OpenNMT/OpenNMT-py/blob/master/docs/source/Summarization.md) for the correct format.

#### CNN-DM

| Model Type | Model | R1 R | R1 P | R1 F | R2 R | R2 P | R2 F | RL R | RL P | RL F |
| ------------- | -------- | -----:| -----:| -----:|------:| -----:| -----:|-----: | -----:| -----:|
20 changes: 18 additions & 2 deletions docs/source/conf.py
@@ -48,7 +48,17 @@
'sphinx.ext.githubpages',
'sphinx.ext.napoleon',
'sphinxcontrib.mermaid',
'sphinxcontrib.bibtex']
'sphinxcontrib.bibtex',
'sphinxarg.ext']

# Show base classes
autodoc_default_options = {
'show-inheritance': True
}

# Use "variables" section for Attributes instead of weird block things
# mimicking the function style.
napoleon_use_ivar = True

# Add any paths that contain templates here, relative to this directory.
templates_path = ['.templates']
@@ -117,7 +127,13 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['.static']
html_static_path = ['_static']

html_context = {
'css_files': [
'_static/theme_overrides.css', # override wide tables in RTD theme
],
}

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
8 changes: 4 additions & 4 deletions docs/source/extended.md
@@ -1,5 +1,5 @@

# Example: Translation
# Translation

The example below uses the Moses tokenizer (http://www.statmt.org/moses/) to prepare the data and the Moses BLEU script for evaluation. This example is for training on the WMT'16 Multimodal Translation task (http://www.statmt.org/wmt16/multimodal-task.html).

@@ -23,17 +23,17 @@ python preprocess.py -train_src data/multi30k/train.en.atok -train_tgt data/mult
Step 2. Train the model.

```bash
python train.py -data data/multi30k.atok.low -save_model multi30k_model -gpuid 0
python train.py -data data/multi30k.atok.low -save_model multi30k_model -gpu_ranks 0
```

Step 3. Translate sentences.

```bash
python translate.py -gpu 0 -model multi30k_model_*_e13.pt -src data/multi30k/test.en.atok -tgt data/multi30k/test.de.atok -replace_unk -verbose -output multi30k.test.pred.atok
python translate.py -gpu 0 -model multi30k_model_*_e13.pt -src data/multi30k/test2016.en.atok -tgt data/multi30k/test2016.de.atok -replace_unk -verbose -output multi30k.test.pred.atok
```

And evaluate

```bash
perl tools/multi-bleu.perl data/multi30k/test.de.atok < multi30k.test.pred.atok
perl tools/multi-bleu.perl data/multi30k/test2016.de.atok < multi30k.test.pred.atok
```
10 changes: 5 additions & 5 deletions docs/source/im2text.md
@@ -1,4 +1,4 @@
# Example: Image to Text
# Image to Text

A deep learning-based approach to learning the image-to-text conversion, built on top of the <a href="http://opennmt.net/">OpenNMT</a> system. It is completely data-driven, hence can be used for a variety of image-to-text problems, such as image captioning, optical character recognition and LaTeX decompilation.

@@ -37,21 +37,21 @@ wget -O data/im2text.tgz http://lstm.seas.harvard.edu/latex/im2text_small.tgz; t
python preprocess.py -data_type img -src_dir data/im2text/images/ -train_src data/im2text/src-train.txt \
-train_tgt data/im2text/tgt-train.txt -valid_src data/im2text/src-val.txt \
-valid_tgt data/im2text/tgt-val.txt -save_data data/im2text/demo \
-tgt_seq_length 150 -tgt_words_min_frequency 2
-tgt_seq_length 150 -tgt_words_min_frequency 2 -shard_size 500 -image_channel_size 1
```

2) Train the model.

```
python train.py -model_type img -data data/im2text/demo -save_model demo-model -gpuid 0 -batch_size 20 \
-max_grad_norm 20 -learning_rate 0.1 -word_vec_size 80 -encoder_type brnn
python train.py -model_type img -data data/im2text/demo -save_model demo-model -gpu_ranks 0 -batch_size 20 \
-max_grad_norm 20 -learning_rate 0.1 -word_vec_size 80 -encoder_type brnn -image_channel_size 1
```

3) Translate the images.

```
python translate.py -data_type img -model demo-model_acc_x_ppl_x_e13.pt -src_dir data/im2text/images \
-src data/im2text/src-test.txt -output pred.txt -beam_size 5 -gpu 0 -verbose
-src data/im2text/src-test.txt -output pred.txt -max_length 150 -beam_size 5 -gpu 0 -verbose
```

The above dataset is sampled from the [im2latex-100k-dataset](http://lstm.seas.harvard.edu/latex/im2text.tgz). We provide a trained model [[link]](http://lstm.seas.harvard.edu/latex/py-model.pt) on this dataset.
42 changes: 31 additions & 11 deletions docs/source/index.rst
@@ -2,23 +2,43 @@ Contents
--------

.. toctree::
:caption: Getting Started
:maxdepth: 2

main.md
quickstart.md
onmt.rst
onmt.modules.rst
onmt.translation.rst
onmt.io.rst
Library.md
FAQ.md
CONTRIBUTING.md
ref.rst

options/preprocess.md
options/train.md
options/translate.md

.. toctree::
:caption: Examples
:maxdepth: 2

Library.md
extended.md
Summarization.md
im2text.md
speech2text.md
FAQ.md
CONTRIBUTING.md
ref.rst


.. toctree::
:caption: Scripts
:maxdepth: 2

options/preprocess.rst
options/train.rst
options/translate.rst
options/server.rst


.. toctree::
:caption: API
:maxdepth: 2

onmt.rst
onmt.modules.rst
onmt.translation.rst
onmt.translate.translation_server.rst
onmt.inputters.rst
4 changes: 3 additions & 1 deletion docs/source/main.md
@@ -1,7 +1,7 @@
# Overview


This portal provides a detailled documentation of the OpenNMT toolkit. It describes how to use the PyTorch project and how it works.
This portal provides a detailed documentation of the OpenNMT toolkit. It describes how to use the PyTorch project and how it works.



@@ -52,3 +52,5 @@ When using OpenNMT for research please cite our
You can find additional help or tutorials in the following resources:

* [Gitter channel](https://gitter.im/OpenNMT/openmt-py)

* [Forum](http://forum.opennmt.net/)
44 changes: 24 additions & 20 deletions docs/source/onmt.modules.rst
@@ -1,4 +1,4 @@
Doc: Modules
Modules
=============

Core Modules
@@ -11,52 +11,56 @@ Core Modules
Encoders
---------

.. autoclass:: onmt.modules.EncoderBase
.. autoclass:: onmt.encoders.EncoderBase
:members:

.. autoclass:: onmt.modules.MeanEncoder
.. autoclass:: onmt.encoders.MeanEncoder
:members:

.. autoclass:: onmt.modules.RNNEncoder
.. autoclass:: onmt.encoders.RNNEncoder
:members:


Decoders
---------


.. autoclass:: onmt.modules.RNNDecoderBase
.. autoclass:: onmt.decoders.DecoderBase
:members:


.. autoclass:: onmt.modules.StdRNNDecoder

.. autoclass:: onmt.decoders.decoder.RNNDecoderBase
:members:

.. autoclass:: onmt.decoders.StdRNNDecoder
:members:

.. autoclass:: onmt.modules.InputFeedRNNDecoder
.. autoclass:: onmt.decoders.InputFeedRNNDecoder
:members:

Attention
----------

.. autoclass:: onmt.modules.AverageAttention
:members:

.. autoclass:: onmt.modules.GlobalAttention
:members:



Architecture: Transfomer
Architecture: Transformer
----------------------------

.. autoclass:: onmt.modules.PositionalEncoding
:members:

.. autoclass:: onmt.modules.PositionwiseFeedForward
.. autoclass:: onmt.modules.position_ffn.PositionwiseFeedForward
:members:

.. autoclass:: onmt.modules.TransformerEncoder
.. autoclass:: onmt.encoders.TransformerEncoder
:members:

.. autoclass:: onmt.modules.TransformerDecoder
.. autoclass:: onmt.decoders.TransformerDecoder
:members:

.. autoclass:: onmt.modules.MultiHeadedAttention
@@ -71,23 +75,23 @@ Architecture: Conv2Conv
and have not been thoroughly tested.)


.. autoclass:: onmt.modules.CNNEncoder
.. autoclass:: onmt.encoders.CNNEncoder
:members:


.. autoclass:: onmt.modules.CNNDecoder
.. autoclass:: onmt.decoders.CNNDecoder
:members:

.. autoclass:: onmt.modules.ConvMultiStepAttention
:members:

.. autoclass:: onmt.modules.WeightNorm
.. autoclass:: onmt.modules.WeightNormConv2d
:members:

Architecture: SRU
----------------------------

.. autoclass:: onmt.modules.SRU
.. autoclass:: onmt.models.sru.SRU
:members:


@@ -96,13 +100,13 @@ Alternative Encoders

onmt\.modules\.AudioEncoder

.. autoclass:: onmt.modules.AudioEncoder
.. autoclass:: onmt.encoders.AudioEncoder
:members:


onmt\.modules\.ImageEncoder

.. autoclass:: onmt.modules.ImageEncoder
.. autoclass:: onmt.encoders.ImageEncoder
:members:


@@ -116,5 +120,5 @@ Copy Attention
Structured Attention
-------------------------------------------

.. autoclass:: onmt.modules.MatrixTree
.. autoclass:: onmt.modules.structured_attention.MatrixTree
:members:
15 changes: 6 additions & 9 deletions docs/source/onmt.rst
@@ -1,13 +1,10 @@
Doc: Framework
Framework
=================

Model
-----

.. autoclass:: onmt.Models.NMTModel
:members:

.. autoclass:: onmt.Models.DecoderState
.. autoclass:: onmt.models.NMTModel
:members:

Trainer
@@ -17,19 +14,19 @@ Trainer
:members:


.. autoclass:: onmt.Statistics
.. autoclass:: onmt.utils.Statistics
:members:

Loss
----


.. autoclass:: onmt.Loss.LossComputeBase
.. autoclass:: onmt.utils.loss.LossComputeBase
:members:


Optim
Optimizer
-----

.. autoclass:: onmt.Optim.Optim
.. autoclass:: onmt.utils.Optimizer
:members:
20 changes: 16 additions & 4 deletions docs/source/onmt.translation.rst
@@ -1,4 +1,4 @@
Doc: Translation
Translation
==================

Translations
@@ -17,10 +17,22 @@ Translator Class
:members:


Beam Search
-------------
Decoding Strategies
--------------------
.. autoclass:: onmt.translate.DecodeStrategy
:members:

.. autoclass:: onmt.translate.BeamSearch
:members:

.. autofunction:: onmt.translate.random_sampling.sample_with_temperature

.. autoclass:: onmt.translate.RandomSampling
:members:

.. autoclass:: onmt.translate.Beam
Scoring
--------
.. autoclass:: onmt.translate.penalties.PenaltyBuilder
:members:

.. autoclass:: onmt.translate.GNMTGlobalScorer
15 changes: 14 additions & 1 deletion docs/source/options/preprocess.md
@@ -29,7 +29,14 @@ Output file for the prepared data
* **-max_shard_size []**
For a text corpus of large volume, it will be divided into shards of this size for
preprocessing. If 0, the data will be handled as a whole. The unit is bytes.
Optimal value should be multiples of 64 bytes.
Optimal value should be multiples of 64 bytes. A commonly used sharding value is
131072000. It is recommended to ensure the corpus is shuffled before sharding.

* **-shard_size []**
Divide src_corpus and tgt_corpus into multiple smaller src_corpus and tgt_corpus
files, then build shards. Each shard will have opt.shard_size samples, except the
last one. shard_size=0 means no segmentation; shard_size>0 means segment the
dataset into multiple shards, each with shard_size samples.
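As a rough illustration (the paths and the shard size below are placeholder values, not recommendations), the option is passed alongside the usual preprocessing flags:

```
python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt \
                     -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt \
                     -save_data data/demo -shard_size 100000
```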

### **Vocab**:
* **-src_vocab []**
@@ -84,6 +91,9 @@ Random seed
* **-report_every [100000]**
Report status every this many sentences

* **-log_file []**
Output logs to a file under this path.

### **Speech**:
* **-sample_rate [16000]**
Sample rate.
@@ -96,3 +106,6 @@ Window stride for spectrogram in seconds.

* **-window [hamming]**
Window type for spectrogram generation.

* **-image_channel_size [3]**
Using grayscale images can make training faster and the model smaller.
105 changes: 86 additions & 19 deletions docs/source/options/train.md
@@ -59,8 +59,21 @@ Number of layers in the encoder
* **-dec_layers [2]**
Number of layers in the decoder

* **-rnn_size [500]**
Size of rnn hidden states
* **-rnn_size [-1]**
Size of rnn hidden states. Overwrites enc_rnn_size and dec_rnn_size

* **-enc_rnn_size [500]**
Size of encoder rnn hidden states. Must be equal to dec_rnn_size except for
speech-to-text.

* **-dec_rnn_size [500]**
Size of decoder rnn hidden states. Must be equal to enc_rnn_size except for
speech-to-text.

* **-audio_enc_pooling [1]**
The amount of pooling in the audio encoder: either a single number giving the same
amount of pooling across all layers, or per-layer amounts of pooling separated by
commas.

* **-cnn_kernel_width [3]**
Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in conv
@@ -80,19 +93,32 @@ The gate type to use in the RNNs
* **-brnn []**
Deprecated, use `encoder_type`.

* **-brnn_merge [concat]**
Merge action for the bidir hidden states

* **-context_gate []**
Type of context gate to use. Leave unset to use no context gate.

### **Model- Attention**:
* **-global_attention [general]**
The attention type to use: dotprod or general (Luong) or MLP (Bahdanau)

* **-global_attention_function [softmax]**

* **-self_attn_type [scaled-dot]**
Self attention type in Transformer decoder layer -- currently "scaled-dot" or
"average"

* **-heads [8]**
Number of heads for transformer self-attention

* **-transformer_ff [2048]**
Size of hidden transformer feed-forward

* **-copy_attn []**
Train copy attention layer.

* **-generator_function [softmax]**
Which function to use for generating probabilities over the target vocabulary
(choices: softmax, sparsemax)

* **-copy_attn_force []**
When available, train to copy.

@@ -113,19 +139,47 @@ Lambda value for coverage.
Path prefix to the ".train.pt" and ".valid.pt" file path from preprocess.py

* **-save_model [model]**
Model filename (the model will be saved as <save_model>_epochN_PPL.pt where PPL
is the validation perplexity
Model filename (the model will be saved as <save_model>_N.pt where N is the
number of steps)

* **-save_checkpoint_steps [5000]**
Save a checkpoint every X steps

* **-reset_optim [none]**
Ability to reset optimizer. Options:
"all": reset completely the optimizer (train_steps, type of optim, ....),
"states": load everything from the checkpoint except Adam states,
"keep_states": load Adam states from the checkpoint but apply
command line changes

* **-keep_checkpoint [-1]**
Keep X checkpoints (negative: keep all)

* **-gpuid []**
Use CUDA on the listed devices.
Deprecated, see world_size and gpu_ranks.

* **-gpu_ranks []**
List of ranks of each process.

* **-world_size [1]**
Total number of distributed processes.

* **-gpu_backend [nccl]**
Type of torch distributed backend

* **-gpu_verbose_level []**
Gives more info on each process per GPU.

* **-master_ip [localhost]**
IP of master for torch.distributed training.

* **-master_port [10000]**
Port of master for torch.distributed training.

* **-seed [-1]**
Random seed used for experiment reproducibility.
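Putting the checkpointing and distributed options together, a hypothetical two-GPU run on a single node might look like this (the data path, model prefix, and values are placeholders, not recommendations):

```
export CUDA_VISIBLE_DEVICES=1,3
python train.py -data data/demo -save_model demo-model \
                -world_size 2 -gpu_ranks 0 1 \
                -save_checkpoint_steps 5000 -keep_checkpoint 10 -seed 42
```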

### **Initialization**:
* **-start_epoch [1]**
The epoch from which to start

* **-param_init [0.1]**
Parameters are initialized over uniform distribution with support (-param_init,
param_init). Use 0 to not use initialization
@@ -149,7 +203,7 @@ the decoder side. See README for specific formatting instructions.
Fix word embeddings on the encoder side.

* **-fix_word_vecs_dec []**
Fix word embeddings on the encoder side.
Fix word embeddings on the decoder side.

### **Optimization- Type**:
* **-batch_size [64]**
@@ -166,15 +220,21 @@ Normalization method of the gradient.
Accumulate gradient this many times. Approximately equivalent to updating
batch_size * accum_count batches at once. Recommended for Transformer.

* **-valid_steps [10000]**
Perform validation every X steps

* **-valid_batch_size [32]**
Maximum batch size for validation

* **-max_generator_batches [32]**
Maximum batches of words in a sequence to run the generator on in parallel.
Higher is faster, but uses more memory.

* **-epochs [13]**
Number of training epochs
* **-train_steps [100000]**
Number of training steps

* **-epochs []**
Deprecated, see train_steps

* **-optim [sgd]**
Optimization method.
@@ -220,13 +280,14 @@ Starting learning rate. Recommended settings: sgd = 1, adagrad = 0.1, adadelta =

* **-learning_rate_decay [0.5]**
If update_learning_rate, decay learning rate by this much if (i) perplexity does
not decrease on the validation set or (ii) epoch has gone past start_decay_at
not decrease on the validation set or (ii) steps have gone past
start_decay_steps

* **-start_decay_at [8]**
Start decaying every epoch after and including this epoch
* **-start_decay_steps [50000]**
Start decaying every decay_steps after start_decay_steps

* **-start_checkpoint_at []**
Start checkpointing every epoch after and including this epoch
* **-decay_steps [10000]**
Decay every decay_steps

* **-decay_method []**
Use a custom decay rate.
@@ -238,6 +299,9 @@ Number of warmup steps for custom decay.
* **-report_every [50]**
Print stats at this interval.

* **-log_file []**
Output logs to a file under this path.

* **-exp_host []**
Send logs to this crayon server.

@@ -257,3 +321,6 @@ Sample rate.

* **-window_size [0.02]**
Window size for spectrogram in seconds.

* **-image_channel_size [3]**
Using grayscale images can make training faster and the model smaller.
18 changes: 14 additions & 4 deletions docs/source/options/translate.md
@@ -5,8 +5,9 @@ translate.py
translate.py

### **Model**:
* **-model []**
Path to model .pt file
* **-models []**
Path to model .pt file(s). Multiple models can be specified, for ensemble
decoding.
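For example, ensemble decoding could be sketched as follows, using the flag name as listed above (the checkpoint and data names are placeholders):

```
python translate.py -models model_a.pt model_b.pt \
                    -src data/src-test.txt -output pred.txt -beam_size 5 -gpu 0
```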

### **Data**:
* **-data_type [text]**
@@ -38,6 +39,9 @@ Create dynamic dictionaries
Share source and target vocabulary

### **Beam**:
* **-fast []**
Use fast beam search (some features may not be supported!)

* **-beam_size [5]**
Beam size

@@ -54,10 +58,10 @@ Deprecated, use `-max_length` instead
Apply penalty at every decoding step. Helpful for summary penalty.

* **-length_penalty [none]**
Length Penalty to use.
Length Penalty to use. Options are [wu | avg | none]

* **-coverage_penalty [none]**
Coverage Penalty to use.
Coverage Penalty to use. Options are [wu | summary | none]

* **-alpha []**
Google NMT length penalty parameter (higher = longer generation)
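As an illustration only (the model and data paths are placeholders and the values are not recommendations), the penalty options above might be combined as:

```
python translate.py -model model.pt -src data/src-test.txt -output pred.txt \
                    -stepwise_penalty -coverage_penalty summary -beta 5 \
                    -length_penalty wu -alpha 0.9
```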
@@ -83,6 +87,9 @@ source token
* **-verbose []**
Print scores and predictions for each sentence

* **-log_file []**
Output logs to a file under this path.

* **-attn_debug []**
Print best attn for each word

@@ -111,3 +118,6 @@ Window stride for spectrogram in seconds

* **-window [hamming]**
Window type for spectrogram generation

* **-image_channel_size [3]**
Using grayscale images can make training faster and the model smaller.
7 changes: 5 additions & 2 deletions docs/source/quickstart.md
@@ -35,8 +35,11 @@ python train.py -data data/demo -save_model demo-model

The main train command is quite simple. Minimally it takes a data file
and a save file. This will run the default model, which consists of a
2-layer LSTM with 500 hidden units on both the encoder/decoder. You
can also add `-gpuid 1` to use (say) GPU 1.
2-layer LSTM with 500 hidden units on both the encoder/decoder.
If you want to train on GPUs, you need to set, for example:
CUDA_VISIBLE_DEVICES=1,3
`-world_size 2 -gpu_ranks 0 1` to use (say) GPUs 1 and 3 on this node only.
To learn more about distributed training on single or multiple nodes, read the FAQ section.
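A minimal sketch of such a run, assuming GPUs 1 and 3 are the ones you want to use (data path and model prefix are placeholders):

```
export CUDA_VISIBLE_DEVICES=1,3
python train.py -data data/demo -save_model demo-model -world_size 2 -gpu_ranks 0 1
```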

### Step 3: Translate

103 changes: 103 additions & 0 deletions docs/source/refs.bib
@@ -332,3 +332,106 @@ @ARTICLE{2017arXiv170301619N
adsurl = {http://adsabs.harvard.edu/abs/2017arXiv170301619N},
adsnote = {Provided by the SAO/NASA Astrophysics Data System}
}

@article{DBLP:journals/corr/VaswaniSPUJGKP17,
author = {Ashish Vaswani and
Noam Shazeer and
Niki Parmar and
Jakob Uszkoreit and
Llion Jones and
Aidan N. Gomez and
Lukasz Kaiser and
Illia Polosukhin},
title = {Attention Is All You Need},
journal = {CoRR},
volume = {abs/1706.03762},
year = {2017},
url = {http://arxiv.org/abs/1706.03762},
archivePrefix = {arXiv},
eprint = {1706.03762},
timestamp = {Mon, 13 Aug 2018 16:48:37 +0200},
biburl = {https://dblp.org/rec/bib/journals/corr/VaswaniSPUJGKP17},
bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/GehringAGYD17,
author = {Jonas Gehring and
Michael Auli and
David Grangier and
Denis Yarats and
Yann N. Dauphin},
title = {Convolutional Sequence to Sequence Learning},
journal = {CoRR},
volume = {abs/1705.03122},
year = {2017},
url = {http://arxiv.org/abs/1705.03122},
archivePrefix = {arXiv},
eprint = {1705.03122},
timestamp = {Mon, 13 Aug 2018 16:48:03 +0200},
biburl = {https://dblp.org/rec/bib/journals/corr/GehringAGYD17},
bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-1709-02755,
author = {Tao Lei and
Yu Zhang and
Yoav Artzi},
title = {Training RNNs as Fast as CNNs},
journal = {CoRR},
volume = {abs/1709.02755},
year = {2017},
url = {http://arxiv.org/abs/1709.02755},
archivePrefix = {arXiv},
eprint = {1709.02755},
timestamp = {Mon, 13 Aug 2018 16:46:29 +0200},
biburl = {https://dblp.org/rec/bib/journals/corr/abs-1709-02755},
bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/SeeLM17,
author = {Abigail See and
Peter J. Liu and
Christopher D. Manning},
title = {Get To The Point: Summarization with Pointer-Generator Networks},
journal = {CoRR},
volume = {abs/1704.04368},
year = {2017},
url = {http://arxiv.org/abs/1704.04368},
archivePrefix = {arXiv},
eprint = {1704.04368},
timestamp = {Mon, 13 Aug 2018 16:46:08 +0200},
biburl = {https://dblp.org/rec/bib/journals/corr/SeeLM17},
bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-1805-00631,
author = {Biao Zhang and
Deyi Xiong and
Jinsong Su},
title = {Accelerating Neural Transformer via an Average Attention Network},
journal = {CoRR},
volume = {abs/1805.00631},
year = {2018},
url = {http://arxiv.org/abs/1805.00631},
archivePrefix = {arXiv},
eprint = {1805.00631},
timestamp = {Mon, 13 Aug 2018 16:46:01 +0200},
biburl = {https://dblp.org/rec/bib/journals/corr/abs-1805-00631},
bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/MartinsA16,
author = {Andr{\'{e}} F. T. Martins and
Ram{\'{o}}n Fern{\'{a}}ndez Astudillo},
title = {From Softmax to Sparsemax: {A} Sparse Model of Attention and Multi-Label
Classification},
journal = {CoRR},
volume = {abs/1602.02068},
year = {2016},
url = {http://arxiv.org/abs/1602.02068},
archivePrefix = {arXiv},
eprint = {1602.02068},
timestamp = {Mon, 13 Aug 2018 16:49:13 +0200},
biburl = {https://dblp.org/rec/bib/journals/corr/MartinsA16},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
6 changes: 3 additions & 3 deletions docs/source/speech2text.md
@@ -1,4 +1,4 @@
# Example: Speech to Text
# Speech to Text

A deep learning-based approach to learning the speech-to-text conversion, built on top of the <a href="http://opennmt.net/">OpenNMT</a> system.

@@ -23,13 +23,13 @@ wget -O data/speech.tgz http://lstm.seas.harvard.edu/latex/speech.tgz; tar zxf d
1) Preprocess the data.

```
python preprocess.py -data_type audio -src_dir data/speech/an4_dataset -train_src data/speech/src-train.txt -train_tgt data/speech/tgt-train.txt -valid_src data/speech/src-val.txt -valid_tgt data/speech/tgt-val.txt -save_data data/speech/demo
python preprocess.py -data_type audio -src_dir data/speech/an4_dataset -train_src data/speech/src-train.txt -train_tgt data/speech/tgt-train.txt -valid_src data/speech/src-val.txt -valid_tgt data/speech/tgt-val.txt -shard_size 300 -save_data data/speech/demo
```

2) Train the model.

```
python train.py -model_type audio -data data/speech/demo -save_model demo-model -gpuid 0 -batch_size 16 -max_grad_norm 20 -learning_rate 0.1 -learning_rate_decay 0.98 -train_steps 100000
python train.py -model_type audio -enc_rnn_size 512 -dec_rnn_size 512 -audio_enc_pooling 1,1,2,2 -dropout 0 -enc_layers 4 -dec_layers 1 -rnn_type LSTM -data data/speech/demo -save_model demo-model -global_attention mlp -gpu_ranks 0 -batch_size 8 -optim adam -max_grad_norm 100 -learning_rate 0.0003 -learning_rate_decay 0.8 -train_steps 100000
```

3) Translate the speech.
85 changes: 85 additions & 0 deletions extract_embeddings.py
@@ -0,0 +1,85 @@
from __future__ import division
import torch
import argparse
import onmt
import onmt.model_builder
import onmt.inputters
import onmt.opts

from onmt.utils.misc import use_gpu
from onmt.utils.logging import init_logger, logger

parser = argparse.ArgumentParser(description='translate.py')

parser.add_argument('-model', required=True,
help='Path to model .pt file')
parser.add_argument('-output_dir', default='.',
help="""Path to output the embeddings""")
parser.add_argument('-gpu', type=int, default=-1,
help="Device to run on")


def write_embeddings(filename, dict, embeddings):
with open(filename, 'wb') as file:
for i in range(min(len(embeddings), len(dict.itos))):
str = dict.itos[i].encode("utf-8")
for j in range(len(embeddings[0])):
str = str + (" %5f" % (embeddings[i][j])).encode("utf-8")
file.write(str + b"\n")


def main():
dummy_parser = argparse.ArgumentParser(description='train.py')
onmt.opts.model_opts(dummy_parser)
dummy_opt = dummy_parser.parse_known_args([])[0]
opt = parser.parse_args()
opt.cuda = opt.gpu > -1
if opt.cuda:
torch.cuda.set_device(opt.gpu)

# Add in default model arguments, possibly added since training.
checkpoint = torch.load(opt.model,
map_location=lambda storage, loc: storage)
model_opt = checkpoint['opt']

src_dict, tgt_dict = None, None

# the vocab object is a list of tuple (name, torchtext.Vocab)
# we iterate over this list and associate vocabularies based on the name
for vocab in checkpoint['vocab']:
if vocab[0] == 'src':
src_dict = vocab[1]
if vocab[0] == 'tgt':
tgt_dict = vocab[1]
assert src_dict is not None and tgt_dict is not None

fields = onmt.inputters.load_fields_from_vocab(checkpoint['vocab'])

model_opt = checkpoint['opt']
for arg in dummy_opt.__dict__:
if arg not in model_opt:
model_opt.__dict__[arg] = dummy_opt.__dict__[arg]

model = onmt.model_builder.build_base_model(
model_opt, fields, use_gpu(opt), checkpoint)
encoder = model.encoder
decoder = model.decoder

encoder_embeddings = encoder.embeddings.word_lut.weight.data.tolist()
decoder_embeddings = decoder.embeddings.word_lut.weight.data.tolist()

logger.info("Writing source embeddings")
write_embeddings(opt.output_dir + "/src_embeddings.txt", src_dict,
encoder_embeddings)

logger.info("Writing target embeddings")
write_embeddings(opt.output_dir + "/tgt_embeddings.txt", tgt_dict,
decoder_embeddings)

logger.info('... done.')
logger.info('Converting model...')


if __name__ == "__main__":
init_logger('extract_embeddings.log')
main()
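A minimal usage sketch for this script, assuming a trained checkpoint is available (the file and directory names are placeholders):

```
python extract_embeddings.py -model demo-model_step_100000.pt -output_dir embeddings -gpu 0
# writes embeddings/src_embeddings.txt and embeddings/tgt_embeddings.txt
```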
13 changes: 13 additions & 0 deletions mostra_opcoes.lua
@@ -0,0 +1,13 @@
require('onmt.init')

local cmd = torch.CmdLine()
cmd:option('-model', '', 'trained model file')
cmd:option('-gpuid', 0, 'gpu id to load gpu model')
local opt = cmd:parse(arg)

if opt.gpuid > 0 then
require('cutorch')
end
local checkpoint = torch.load(opt.model)
print(checkpoint.options)

2 changes: 1 addition & 1 deletion onmt/__init__.py
@@ -17,4 +17,4 @@
__all__ = [onmt.inputters, onmt.encoders, onmt.decoders, onmt.models,
onmt.utils, onmt.modules, "Trainer"]

__version__ = "0.2.0"
__version__ = "0.8.2"
11 changes: 11 additions & 0 deletions onmt/decoders/__init__.py
@@ -1 +1,12 @@
"""Module defining decoders."""
from onmt.decoders.decoder import DecoderBase, InputFeedRNNDecoder, \
StdRNNDecoder
from onmt.decoders.transformer import TransformerDecoder
from onmt.decoders.cnn_decoder import CNNDecoder


str2dec = {"rnn": StdRNNDecoder, "ifrnn": InputFeedRNNDecoder,
"cnn": CNNDecoder, "transformer": TransformerDecoder}

__all__ = ["DecoderBase", "TransformerDecoder", "StdRNNDecoder", "CNNDecoder",
"InputFeedRNNDecoder", "str2dec"]
167 changes: 70 additions & 97 deletions onmt/decoders/cnn_decoder.py
@@ -1,79 +1,88 @@
"""
Implementation of the CNN Decoder part of
"Convolutional Sequence to Sequence Learning"
"""Implementation of the CNN Decoder part of
"Convolutional Sequence to Sequence Learning"
"""
import torch
import torch.nn as nn

import onmt.modules
from onmt.decoders.decoder import DecoderState
from onmt.utils.misc import aeq
from onmt.modules import ConvMultiStepAttention, GlobalAttention
from onmt.utils.cnn_factory import shape_transform, GatedConv
from onmt.decoders.decoder import DecoderBase

SCALE_WEIGHT = 0.5 ** 0.5


class CNNDecoder(nn.Module):
"""
Decoder built on CNN, based on :cite:`DBLP:journals/corr/GehringAGYD17`.
class CNNDecoder(DecoderBase):
"""Decoder based on "Convolutional Sequence to Sequence Learning"
:cite:`DBLP:journals/corr/GehringAGYD17`.
Consists of residual convolutional layers, with ConvMultiStepAttention.
"""

def __init__(self, num_layers, hidden_size, attn_type,
copy_attn, cnn_kernel_width, dropout, embeddings):
copy_attn, cnn_kernel_width, dropout, embeddings,
copy_attn_type):
super(CNNDecoder, self).__init__()

# Basic attributes.
self.decoder_type = 'cnn'
self.num_layers = num_layers
self.hidden_size = hidden_size
self.cnn_kernel_width = cnn_kernel_width
self.embeddings = embeddings
self.dropout = dropout

# Build the CNN.
# Decoder State
self.state = {}

input_size = self.embeddings.embedding_size
self.linear = nn.Linear(input_size, self.hidden_size)
self.conv_layers = nn.ModuleList()
for _ in range(self.num_layers):
self.conv_layers.append(
GatedConv(self.hidden_size, self.cnn_kernel_width,
self.dropout, True))

self.attn_layers = nn.ModuleList()
for _ in range(self.num_layers):
self.attn_layers.append(
onmt.modules.ConvMultiStepAttention(self.hidden_size))
self.linear = nn.Linear(input_size, hidden_size)
self.conv_layers = nn.ModuleList(
[GatedConv(hidden_size, cnn_kernel_width, dropout, True)
for i in range(num_layers)]
)
self.attn_layers = nn.ModuleList(
[ConvMultiStepAttention(hidden_size) for i in range(num_layers)]
)

# CNNDecoder has its own attention mechanism.
# Set up a separated copy attention layer, if needed.
self._copy = False
# Set up a separate copy attention layer if needed.
assert not copy_attn, "Copy mechanism not yet tested in conv2conv"
if copy_attn:
self.copy_attn = onmt.modules.GlobalAttention(
hidden_size, attn_type=attn_type)
self._copy = True

def forward(self, tgt, memory_bank, state, memory_lengths=None, step=None):
self.copy_attn = GlobalAttention(
hidden_size, attn_type=copy_attn_type)
else:
self.copy_attn = None

@classmethod
def from_opt(cls, opt, embeddings):
"""Alternate constructor."""
return cls(
opt.dec_layers,
opt.dec_rnn_size,
opt.global_attention,
opt.copy_attn,
opt.cnn_kernel_width,
opt.dropout,
embeddings,
opt.copy_attn_type)

def init_state(self, _, memory_bank, enc_hidden):
"""Init decoder state."""
self.state["src"] = (memory_bank + enc_hidden) * SCALE_WEIGHT
self.state["previous_input"] = None

def map_state(self, fn):
self.state["src"] = fn(self.state["src"], 1)
if self.state["previous_input"] is not None:
self.state["previous_input"] = fn(self.state["previous_input"], 1)

def detach_state(self):
self.state["previous_input"] = self.state["previous_input"].detach()

def forward(self, tgt, memory_bank, step=None, **kwargs):
""" See :obj:`onmt.modules.RNNDecoderBase.forward()`"""
# NOTE: memory_lengths is only here for compatibility reasons
# with onmt.modules.RNNDecoderBase.forward()
# CHECKS
assert isinstance(state, CNNDecoderState)
_, tgt_batch, _ = tgt.size()
_, contxt_batch, _ = memory_bank.size()
aeq(tgt_batch, contxt_batch)
# END CHECKS

if state.previous_input is not None:
tgt = torch.cat([state.previous_input, tgt], 0)

# Initialize return variables.
outputs = []

if self.state["previous_input"] is not None:
tgt = torch.cat([self.state["previous_input"], tgt], 0)

dec_outs = []
attns = {"std": []}
assert not self._copy, "Copy mechanism not yet tested in conv2conv"
if self._copy:
if self.copy_attn is not None:
attns["copy"] = []

emb = self.embeddings(tgt)
@@ -83,17 +92,15 @@ def forward(self, tgt, memory_bank, state, memory_lengths=None, step=None):
# The output of CNNEncoder.
src_memory_bank_t = memory_bank.transpose(0, 1).contiguous()
# The combination of output of CNNEncoder and source embeddings.
src_memory_bank_c = state.init_src.transpose(0, 1).contiguous()
src_memory_bank_c = self.state["src"].transpose(0, 1).contiguous()

# Run the forward pass of the CNNDecoder.
emb_reshape = tgt_emb.contiguous().view(
tgt_emb.size(0) * tgt_emb.size(1), -1)
linear_out = self.linear(emb_reshape)
x = linear_out.view(tgt_emb.size(0), tgt_emb.size(1), -1)
x = shape_transform(x)

pad = torch.zeros(x.size(0), x.size(1),
self.cnn_kernel_width - 1, 1)
pad = torch.zeros(x.size(0), x.size(1), self.cnn_kernel_width - 1, 1)

pad = pad.type_as(x)
base_target_emb = x
@@ -107,50 +114,16 @@ def forward(self, tgt, memory_bank, state, memory_lengths=None, step=None):
output = x.squeeze(3).transpose(1, 2)

# Process the result and update the attentions.
outputs = output.transpose(0, 1).contiguous()
if state.previous_input is not None:
outputs = outputs[state.previous_input.size(0):]
attn = attn[:, state.previous_input.size(0):].squeeze()
dec_outs = output.transpose(0, 1).contiguous()
if self.state["previous_input"] is not None:
dec_outs = dec_outs[self.state["previous_input"].size(0):]
attn = attn[:, self.state["previous_input"].size(0):].squeeze()
attn = torch.stack([attn])
attns["std"] = attn
if self._copy:
if self.copy_attn is not None:
attns["copy"] = attn

# Update the state.
state.update_state(tgt)

return outputs, state, attns

def init_decoder_state(self, _, memory_bank, enc_hidden, with_cache=False):
"""
Init decoder state.
"""
return CNNDecoderState(memory_bank, enc_hidden)


class CNNDecoderState(DecoderState):
"""
Init CNN decoder state.
"""

def __init__(self, memory_bank, enc_hidden):
self.init_src = (memory_bank + enc_hidden) * SCALE_WEIGHT
self.previous_input = None

@property
def _all(self):
"""
Contains attributes that need to be updated in self.beam_update().
"""
return (self.previous_input,)

def detach(self):
self.previous_input = self.previous_input.detach()

def update_state(self, new_input):
""" Called for every decoder forward pass. """
self.previous_input = new_input

def repeat_beam_size_times(self, beam_size):
""" Repeat beam_size times along batch dimension. """
self.init_src = self.init_src.data.repeat(1, beam_size, 1)
self.state["previous_input"] = tgt
# TODO change the way attns is returned dict => list or tuple (onnx)
return dec_outs, attns
459 changes: 215 additions & 244 deletions onmt/decoders/decoder.py

Large diffs are not rendered by default.

153 changes: 153 additions & 0 deletions onmt/decoders/ensemble.py
@@ -0,0 +1,153 @@
"""Ensemble decoding.
Decodes using multiple models simultaneously,
combining their prediction distributions by averaging.
All models in the ensemble must share a target vocabulary.
"""

import torch
import torch.nn as nn

from onmt.encoders.encoder import EncoderBase
from onmt.decoders.decoder import DecoderBase
from onmt.models import NMTModel
import onmt.model_builder


class EnsembleDecoderOutput(object):
"""Wrapper around multiple decoder final hidden states."""
def __init__(self, model_dec_outs):
self.model_dec_outs = tuple(model_dec_outs)

def squeeze(self, dim=None):
"""Delegate squeeze to avoid modifying
:func:`onmt.translate.translator.Translator.translate_batch()`
"""
return EnsembleDecoderOutput([
x.squeeze(dim) for x in self.model_dec_outs])

def __getitem__(self, index):
return self.model_dec_outs[index]


class EnsembleEncoder(EncoderBase):
"""Dummy Encoder that delegates to individual real Encoders."""
def __init__(self, model_encoders):
super(EnsembleEncoder, self).__init__()
self.model_encoders = nn.ModuleList(model_encoders)

def forward(self, src, lengths=None):
enc_hidden, memory_bank, _ = zip(*[
model_encoder(src, lengths)
for model_encoder in self.model_encoders])
return enc_hidden, memory_bank, lengths


class EnsembleDecoder(DecoderBase):
"""Dummy Decoder that delegates to individual real Decoders."""
def __init__(self, model_decoders):
model_decoders = nn.ModuleList(model_decoders)
attentional = any([dec.attentional for dec in model_decoders])
super(EnsembleDecoder, self).__init__(attentional)
self.model_decoders = model_decoders

def forward(self, tgt, memory_bank, memory_lengths=None, step=None):
"""See :func:`onmt.decoders.decoder.DecoderBase.forward()`."""
# Memory_lengths is a single tensor shared between all models.
# This assumption will not hold if Translator is modified
# to calculate memory_lengths as something other than the length
# of the input.
dec_outs, attns = zip(*[
model_decoder(
tgt, memory_bank[i],
memory_lengths=memory_lengths, step=step)
for i, model_decoder in enumerate(self.model_decoders)])
mean_attns = self.combine_attns(attns)
return EnsembleDecoderOutput(dec_outs), mean_attns

def combine_attns(self, attns):
result = {}
for key in attns[0].keys():
result[key] = torch.stack(
[attn[key] for attn in attns if attn[key] is not None]).mean(0)
return result

def init_state(self, src, memory_bank, enc_hidden):
""" See :obj:`RNNDecoderBase.init_state()` """
for i, model_decoder in enumerate(self.model_decoders):
model_decoder.init_state(src, memory_bank[i], enc_hidden[i])

def map_state(self, fn):
for model_decoder in self.model_decoders:
model_decoder.map_state(fn)


class EnsembleGenerator(nn.Module):
"""
Dummy Generator that delegates to individual real Generators,
and then averages the resulting target distributions.
"""
def __init__(self, model_generators, raw_probs=False):
super(EnsembleGenerator, self).__init__()
self.model_generators = nn.ModuleList(model_generators)
self._raw_probs = raw_probs

def forward(self, hidden, attn=None, src_map=None):
"""
Compute a distribution over the target dictionary
by averaging distributions from models in the ensemble.
All models in the ensemble must share a target vocabulary.
"""
distributions = torch.stack(
[mg(h) if attn is None else mg(h, attn, src_map)
for h, mg in zip(hidden, self.model_generators)]
)
if self._raw_probs:
return torch.log(torch.exp(distributions).mean(0))
else:
return distributions.mean(0)


class EnsembleModel(NMTModel):
"""Dummy NMTModel wrapping individual real NMTModels."""
def __init__(self, models, raw_probs=False):
encoder = EnsembleEncoder(model.encoder for model in models)
decoder = EnsembleDecoder(model.decoder for model in models)
super(EnsembleModel, self).__init__(encoder, decoder)
self.generator = EnsembleGenerator(
[model.generator for model in models], raw_probs)
self.models = nn.ModuleList(models)


def load_test_model(opt):
"""Read in multiple models for ensemble."""
shared_fields = None
shared_model_opt = None
models = []
for model_path in opt.models:
fields, model, model_opt = \
onmt.model_builder.load_test_model(opt, model_path=model_path)
if shared_fields is None:
shared_fields = fields
else:
for key, field in fields.items():
try:
f_iter = iter(field)
except TypeError:
f_iter = [(key, field)]
for sn, sf in f_iter:
if sf is not None and 'vocab' in sf.__dict__:
sh_field = shared_fields[key]
try:
sh_f_iter = iter(sh_field)
except TypeError:
sh_f_iter = [(key, sh_field)]
sh_f_dict = dict(sh_f_iter)
assert sf.vocab.stoi == sh_f_dict[sn].vocab.stoi, \
"Ensemble models must use the same " \
"preprocessed data"
models.append(model)
if shared_model_opt is None:
shared_model_opt = model_opt
ensemble_model = EnsembleModel(models, opt.avg_raw_probs)
return shared_fields, ensemble_model, shared_model_opt
336 changes: 123 additions & 213 deletions onmt/decoders/transformer.py

Large diffs are not rendered by default.

9 changes: 8 additions & 1 deletion onmt/encoders/__init__.py
@@ -4,6 +4,13 @@
from onmt.encoders.rnn_encoder import RNNEncoder
from onmt.encoders.cnn_encoder import CNNEncoder
from onmt.encoders.mean_encoder import MeanEncoder
from onmt.encoders.audio_encoder import AudioEncoder
from onmt.encoders.image_encoder import ImageEncoder


str2enc = {"rnn": RNNEncoder, "brnn": RNNEncoder, "cnn": CNNEncoder,
"transformer": TransformerEncoder, "img": ImageEncoder,
"audio": AudioEncoder, "mean": MeanEncoder}

__all__ = ["EncoderBase", "TransformerEncoder", "RNNEncoder", "CNNEncoder",
"MeanEncoder"]
"MeanEncoder", "str2enc"]
171 changes: 119 additions & 52 deletions onmt/encoders/audio_encoder.py
@@ -1,72 +1,139 @@
""" Audio encoder """
"""Audio encoder"""
import math

import torch.nn as nn
import torch.nn.functional as F

from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_packed_sequence as unpack

from onmt.utils.rnn_factory import rnn_factory
from onmt.encoders.encoder import EncoderBase

class AudioEncoder(nn.Module):
"""
A simple encoder convolutional -> recurrent neural network for
audio input.

class AudioEncoder(EncoderBase):
"""A simple encoder CNN -> RNN for audio input.
Args:
num_layers (int): number of encoder layers.
bidirectional (bool): bidirectional encoder.
rnn_size (int): size of hidden states of the rnn.
rnn_type (str): Type of RNN (e.g. GRU, LSTM, etc).
enc_layers (int): Number of encoder layers.
dec_layers (int): Number of decoder layers.
brnn (bool): Bidirectional encoder.
enc_rnn_size (int): Size of hidden states of the rnn.
dec_rnn_size (int): Size of the decoder hidden states.
enc_pooling (str): A comma separated list either of length 1
or of length ``enc_layers`` specifying the pooling amount.
dropout (float): dropout probability.
sample_rate (float): input spec
window_size (int): input spec
"""

def __init__(self, num_layers, bidirectional, rnn_size, dropout,
def __init__(self, rnn_type, enc_layers, dec_layers, brnn,
enc_rnn_size, dec_rnn_size, enc_pooling, dropout,
sample_rate, window_size):
super(AudioEncoder, self).__init__()
self.num_layers = num_layers
self.num_directions = 2 if bidirectional else 1
self.hidden_size = rnn_size

self.layer1 = nn.Conv2d(1, 32, kernel_size=(41, 11),
padding=(0, 10), stride=(2, 2))
self.batch_norm1 = nn.BatchNorm2d(32)
self.layer2 = nn.Conv2d(32, 32, kernel_size=(21, 11),
padding=(0, 0), stride=(2, 1))
self.batch_norm2 = nn.BatchNorm2d(32)

self.enc_layers = enc_layers
self.rnn_type = rnn_type
self.dec_layers = dec_layers
num_directions = 2 if brnn else 1
self.num_directions = num_directions
assert enc_rnn_size % num_directions == 0
enc_rnn_size_real = enc_rnn_size // num_directions
assert dec_rnn_size % num_directions == 0
self.dec_rnn_size = dec_rnn_size
dec_rnn_size_real = dec_rnn_size // num_directions
self.dec_rnn_size_real = dec_rnn_size_real
self.dec_rnn_size = dec_rnn_size
input_size = int(math.floor((sample_rate * window_size) / 2) + 1)
input_size = int(math.floor(input_size - 41) / 2 + 1)
input_size = int(math.floor(input_size - 21) / 2 + 1)
input_size *= 32
self.rnn = nn.LSTM(input_size, rnn_size,
num_layers=num_layers,
dropout=dropout,
bidirectional=bidirectional)

def load_pretrained_vectors(self, opt):
""" Pass in needed options only when modify function definition."""
pass
enc_pooling = enc_pooling.split(',')
assert len(enc_pooling) == enc_layers or len(enc_pooling) == 1
if len(enc_pooling) == 1:
enc_pooling = enc_pooling * enc_layers
enc_pooling = [int(p) for p in enc_pooling]
self.enc_pooling = enc_pooling

def forward(self, src, lengths=None):
"See :obj:`onmt.encoders.encoder.EncoderBase.forward()`"
# (batch_size, 1, nfft, t)
# layer 1
src = self.batch_norm1(self.layer1(src[:, :, :, :]))
if dropout > 0:
self.dropout = nn.Dropout(dropout)
else:
self.dropout = None
self.W = nn.Linear(enc_rnn_size, dec_rnn_size, bias=False)
self.batchnorm_0 = nn.BatchNorm1d(enc_rnn_size, affine=True)
self.rnn_0, self.no_pack_padded_seq = \
rnn_factory(rnn_type,
input_size=input_size,
hidden_size=enc_rnn_size_real,
num_layers=1,
dropout=dropout,
bidirectional=brnn)
self.pool_0 = nn.MaxPool1d(enc_pooling[0])
for l in range(enc_layers - 1):
batchnorm = nn.BatchNorm1d(enc_rnn_size, affine=True)
rnn, _ = \
rnn_factory(rnn_type,
input_size=enc_rnn_size,
hidden_size=enc_rnn_size_real,
num_layers=1,
dropout=dropout,
bidirectional=brnn)
setattr(self, 'rnn_%d' % (l + 1), rnn)
setattr(self, 'pool_%d' % (l + 1),
nn.MaxPool1d(enc_pooling[l + 1]))
setattr(self, 'batchnorm_%d' % (l + 1), batchnorm)

# (batch_size, 32, nfft/2, t/2)
src = F.hardtanh(src, 0, 20, inplace=True)
@classmethod
def from_opt(cls, opt, embeddings=None):
"""Alternate constructor."""
if embeddings is not None:
raise ValueError("Cannot use embeddings with AudioEncoder.")
return cls(
opt.rnn_type,
opt.enc_layers,
opt.dec_layers,
opt.brnn,
opt.enc_rnn_size,
opt.dec_rnn_size,
opt.audio_enc_pooling,
opt.dropout,
opt.sample_rate,
opt.window_size)

# (batch_size, 32, nfft/2/2, t/2)
# layer 2
src = self.batch_norm2(self.layer2(src))

# (batch_size, 32, nfft/2/2, t/2)
src = F.hardtanh(src, 0, 20, inplace=True)
def forward(self, src, lengths=None):
"""See :func:`onmt.encoders.encoder.EncoderBase.forward()`"""
batch_size, _, nfft, t = src.size()
src = src.transpose(0, 1).transpose(0, 3).contiguous() \
.view(t, batch_size, nfft)
orig_lengths = lengths
lengths = lengths.view(-1).tolist()

batch_size = src.size(0)
length = src.size(3)
src = src.view(batch_size, -1, length)
src = src.transpose(0, 2).transpose(1, 2)
for l in range(self.enc_layers):
rnn = getattr(self, 'rnn_%d' % l)
pool = getattr(self, 'pool_%d' % l)
batchnorm = getattr(self, 'batchnorm_%d' % l)
stride = self.enc_pooling[l]
packed_emb = pack(src, lengths)
memory_bank, tmp = rnn(packed_emb)
memory_bank = unpack(memory_bank)[0]
t, _, _ = memory_bank.size()
memory_bank = memory_bank.transpose(0, 2)
memory_bank = pool(memory_bank)
lengths = [int(math.floor((length - stride) / stride + 1))
for length in lengths]
memory_bank = memory_bank.transpose(0, 2)
src = memory_bank
t, _, num_feat = src.size()
src = batchnorm(src.contiguous().view(-1, num_feat))
src = src.view(t, -1, num_feat)
if self.dropout and l + 1 != self.enc_layers:
src = self.dropout(src)

output, hidden = self.rnn(src)
memory_bank = memory_bank.contiguous().view(-1, memory_bank.size(2))
memory_bank = self.W(memory_bank).view(-1, batch_size,
self.dec_rnn_size)

return hidden, output
state = memory_bank.new_full((self.dec_layers * self.num_directions,
batch_size, self.dec_rnn_size_real), 0)
if self.rnn_type == 'LSTM':
# The encoder hidden is (layers*directions) x batch x dim.
encoder_final = (state, state)
else:
encoder_final = state
return encoder_final, memory_bank, orig_lengths.new_tensor(lengths)
17 changes: 13 additions & 4 deletions onmt/encoders/cnn_encoder.py
@@ -10,8 +10,7 @@


class CNNEncoder(EncoderBase):
"""
Encoder built on CNN based on
"""Encoder based on "Convolutional Sequence to Sequence Learning"
:cite:`DBLP:journals/corr/GehringAGYD17`.
"""

@@ -25,8 +24,18 @@ def __init__(self, num_layers, hidden_size,
self.cnn = StackedCNN(num_layers, hidden_size,
cnn_kernel_width, dropout)

@classmethod
def from_opt(cls, opt, embeddings):
"""Alternate constructor."""
return cls(
opt.enc_layers,
opt.enc_rnn_size,
opt.cnn_kernel_width,
opt.dropout,
embeddings)

def forward(self, input, lengths=None, hidden=None):
""" See :obj:`onmt.modules.EncoderBase.forward()`"""
"""See :class:`onmt.modules.EncoderBase.forward()`"""
self._check_args(input, lengths, hidden)

emb = self.embeddings(input)
@@ -40,4 +49,4 @@ def forward(self, input, lengths=None, hidden=None):
out = self.cnn(emb_remap)

return emb_remap.squeeze(3).transpose(0, 1).contiguous(), \
out.squeeze(3).transpose(0, 1).contiguous()
out.squeeze(3).transpose(0, 1).contiguous(), lengths
22 changes: 13 additions & 9 deletions onmt/encoders/encoder.py
@@ -1,7 +1,5 @@
"""Base class for encoders and generic multi encoders."""

from __future__ import division

import torch.nn as nn

from onmt.utils.misc import aeq
@@ -10,7 +8,7 @@
class EncoderBase(nn.Module):
"""
Base encoder class. Specifies the interface used by different encoder types
and required by :obj:`onmt.Models.NMTModel`.
and required by :class:`onmt.Models.NMTModel`.
.. mermaid::
@@ -32,6 +30,10 @@ class EncoderBase(nn.Module):
E-->G
"""

@classmethod
def from_opt(cls, opt, embeddings=None):
raise NotImplementedError

def _check_args(self, src, lengths=None, hidden=None):
_, n_batch, _ = src.size()
if lengths is not None:
@@ -41,14 +43,16 @@ def _check_args(self, src, lengths=None, hidden=None):
def forward(self, src, lengths=None):
"""
Args:
src (:obj:`LongTensor`):
padded sequences of sparse indices `[src_len x batch x nfeat]`
lengths (:obj:`LongTensor`): length of each sequence `[batch]`
src (LongTensor):
padded sequences of sparse indices ``(src_len, batch, nfeat)``
lengths (LongTensor): length of each sequence ``(batch,)``
Returns:
(tuple of :obj:`FloatTensor`, :obj:`FloatTensor`):
* final encoder state, used to initialize decoder
* memory bank for attention, `[src_len x batch x hidden]`
(FloatTensor, FloatTensor):
* final encoder state, used to initialize decoder
* memory bank for attention, ``(src_len, batch, hidden)``
"""

raise NotImplementedError
49 changes: 34 additions & 15 deletions onmt/encoders/image_encoder.py
@@ -1,13 +1,13 @@
""" Image Encoder """
"""Image Encoder."""
import torch.nn as nn
import torch.nn.functional as F
import torch

from onmt.encoders.encoder import EncoderBase

class ImageEncoder(nn.Module):
"""
A simple encoder convolutional -> recurrent neural network for
image src.

class ImageEncoder(EncoderBase):
"""A simple encoder CNN -> RNN for image src.
Args:
num_layers (int): number of encoder layers.
@@ -16,13 +16,14 @@ class ImageEncoder(nn.Module):
dropout (float): dropout probability.
"""

def __init__(self, num_layers, bidirectional, rnn_size, dropout):
def __init__(self, num_layers, bidirectional, rnn_size, dropout,
image_chanel_size=3):
super(ImageEncoder, self).__init__()
self.num_layers = num_layers
self.num_directions = 2 if bidirectional else 1
self.hidden_size = rnn_size

self.layer1 = nn.Conv2d(3, 64, kernel_size=(3, 3),
self.layer1 = nn.Conv2d(image_chanel_size, 64, kernel_size=(3, 3),
padding=(1, 1), stride=(1, 1))
self.layer2 = nn.Conv2d(64, 128, kernel_size=(3, 3),
padding=(1, 1), stride=(1, 1))
@@ -40,23 +41,41 @@ def __init__(self, num_layers, bidirectional, rnn_size, dropout):
self.batch_norm3 = nn.BatchNorm2d(512)

src_size = 512
self.rnn = nn.LSTM(src_size, rnn_size,
self.rnn = nn.LSTM(src_size, int(rnn_size / self.num_directions),
num_layers=num_layers,
dropout=dropout,
bidirectional=bidirectional)
self.pos_lut = nn.Embedding(1000, src_size)

@classmethod
def from_opt(cls, opt, embeddings=None):
"""Alternate constructor."""
if embeddings is not None:
raise ValueError("Cannot use embeddings with ImageEncoder.")
# why is the model_opt.__dict__ check necessary?
if "image_channel_size" not in opt.__dict__:
image_channel_size = 3
else:
image_channel_size = opt.image_channel_size
return cls(
opt.enc_layers,
opt.brnn,
opt.enc_rnn_size,
opt.dropout,
image_channel_size
)

def load_pretrained_vectors(self, opt):
""" Pass in needed options only when modify function definition."""
"""Pass in needed options only when modify function definition."""
pass

def forward(self, src, lengths=None):
"See :obj:`onmt.encoders.encoder.EncoderBase.forward()`"
"""See :func:`onmt.encoders.encoder.EncoderBase.forward()`"""

batch_size = src.size(0)
# (batch_size, 64, imgH, imgW)
# layer 1
src = F.relu(self.layer1(src[:, :, :, :]-0.5), True)
src = F.relu(self.layer1(src[:, :, :, :] - 0.5), True)

# (batch_size, 64, imgH/2, imgW/2)
src = F.max_pool2d(src, kernel_size=(2, 2), stride=(2, 2))
@@ -94,15 +113,15 @@ def forward(self, src, lengths=None):
# # (batch_size, 512, H, W)
all_outputs = []
for row in range(src.size(2)):
inp = src[:, :, row, :].transpose(0, 2)\
inp = src[:, :, row, :].transpose(0, 2) \
.transpose(1, 2)
row_vec = torch.Tensor(batch_size).type_as(inp.data)\
.long().fill_(row)
row_vec = torch.Tensor(batch_size).type_as(inp.data) \
.long().fill_(row)
pos_emb = self.pos_lut(row_vec)
with_pos = torch.cat(
(pos_emb.view(1, pos_emb.size(0), pos_emb.size(1)), inp), 0)
outputs, hidden_t = self.rnn(with_pos)
all_outputs.append(outputs)
out = torch.cat(all_outputs, 0)

return hidden_t, out
return hidden_t, out, lengths
28 changes: 22 additions & 6 deletions onmt/encoders/mean_encoder.py
@@ -1,29 +1,45 @@
"""Define a minimal encoder."""
from __future__ import division

from onmt.encoders.encoder import EncoderBase
from onmt.utils.misc import sequence_mask
import torch


class MeanEncoder(EncoderBase):
"""A trivial non-recurrent encoder. Simply applies mean pooling.
Args:
num_layers (int): number of replicated layers
embeddings (:obj:`onmt.modules.Embeddings`): embedding module to use
embeddings (onmt.modules.Embeddings): embedding module to use
"""

def __init__(self, num_layers, embeddings):
super(MeanEncoder, self).__init__()
self.num_layers = num_layers
self.embeddings = embeddings

@classmethod
def from_opt(cls, opt, embeddings):
"""Alternate constructor."""
return cls(
opt.enc_layers,
embeddings)

def forward(self, src, lengths=None):
"See :obj:`EncoderBase.forward()`"
"""See :func:`EncoderBase.forward()`"""
self._check_args(src, lengths)

emb = self.embeddings(src)
_, batch, emb_dim = emb.size()
mean = emb.mean(0).expand(self.num_layers, batch, emb_dim)

if lengths is not None:
# we avoid padding while mean pooling
mask = sequence_mask(lengths).float()
mask = mask / lengths.unsqueeze(1).float()
mean = torch.bmm(mask.unsqueeze(1), emb.transpose(0, 1)).squeeze(1)
else:
mean = emb.mean(0)

mean = mean.expand(self.num_layers, batch, emb_dim)
memory_bank = emb
encoder_final = (mean, mean)
return encoder_final, memory_bank
return encoder_final, memory_bank, lengths
32 changes: 20 additions & 12 deletions onmt/encoders/rnn_encoder.py
@@ -1,6 +1,4 @@
"""Define RNN-based encoders."""
from __future__ import division

import torch.nn as nn
import torch.nn.functional as F

@@ -15,13 +13,13 @@ class RNNEncoder(EncoderBase):
""" A generic recurrent neural network encoder.
Args:
rnn_type (:obj:`str`):
rnn_type (str):
style of recurrent unit to use, one of [RNN, LSTM, GRU, SRU]
bidirectional (bool) : use a bidirectional RNN
num_layers (int) : number of stacked layers
hidden_size (int) : hidden size of each layer
dropout (float) : dropout value for :obj:`nn.Dropout`
embeddings (:obj:`onmt.modules.Embeddings`): embedding module to use
dropout (float) : dropout value for :class:`torch.nn.Dropout`
embeddings (onmt.modules.Embeddings): embedding module to use
"""

def __init__(self, rnn_type, bidirectional, num_layers,
@@ -50,8 +48,20 @@ def __init__(self, rnn_type, bidirectional, num_layers,
hidden_size,
num_layers)

@classmethod
def from_opt(cls, opt, embeddings):
"""Alternate constructor."""
return cls(
opt.rnn_type,
opt.brnn,
opt.enc_layers,
opt.enc_rnn_size,
opt.dropout,
embeddings,
opt.bridge)

def forward(self, src, lengths=None):
"See :obj:`EncoderBase.forward()`"
"""See :func:`EncoderBase.forward()`"""
self._check_args(src, lengths)

emb = self.embeddings(src)
@@ -60,8 +70,8 @@ def forward(self, src, lengths=None):
packed_emb = emb
if lengths is not None and not self.no_pack_padded_seq:
# Lengths data is wrapped inside a Tensor.
lengths = lengths.view(-1).tolist()
packed_emb = pack(emb, lengths)
lengths_list = lengths.view(-1).tolist()
packed_emb = pack(emb, lengths_list)

memory_bank, encoder_final = self.rnn(packed_emb)

@@ -70,7 +80,7 @@ def forward(self, src, lengths=None):

if self.use_bridge:
encoder_final = self._bridge(encoder_final)
return encoder_final, memory_bank
return encoder_final, memory_bank, lengths

def _initialize_bridge(self, rnn_type,
hidden_size,
@@ -88,9 +98,7 @@ def _initialize_bridge(self, rnn_type,
for _ in range(number_of_states)])

def _bridge(self, hidden):
"""
Forward hidden state through bridge
"""
"""Forward hidden state through bridge."""
def bottle_hidden(linear, states):
"""
Transform from 3D to 2D, apply linear and return initial size
74 changes: 42 additions & 32 deletions onmt/encoders/transformer.py
@@ -4,9 +4,8 @@

import torch.nn as nn

import onmt
from onmt.encoders.encoder import EncoderBase
# from onmt.utils.misc import aeq
from onmt.modules import MultiHeadedAttention
from onmt.modules.position_ffn import PositionwiseFeedForward


@@ -23,39 +22,38 @@ class TransformerEncoderLayer(nn.Module):
dropout (float): dropout probability(0-1.0).
"""

def __init__(self, d_model, heads, d_ff, dropout):
def __init__(self, d_model, heads, d_ff, dropout,
max_relative_positions=0):
super(TransformerEncoderLayer, self).__init__()

self.self_attn = onmt.modules.MultiHeadedAttention(
heads, d_model, dropout=dropout)
self.self_attn = MultiHeadedAttention(
heads, d_model, dropout=dropout,
max_relative_positions=max_relative_positions)
self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
self.layer_norm = onmt.modules.LayerNorm(d_model)
self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
self.dropout = nn.Dropout(dropout)

def forward(self, inputs, mask):
"""
Transformer Encoder Layer definition.
Args:
inputs (`FloatTensor`): `[batch_size x src_len x model_dim]`
mask (`LongTensor`): `[batch_size x src_len x src_len]`
inputs (FloatTensor): ``(batch_size, src_len, model_dim)``
mask (LongTensor): ``(batch_size, src_len, src_len)``
Returns:
(`FloatTensor`):
(FloatTensor):
* outputs `[batch_size x src_len x model_dim]`
* outputs ``(batch_size, src_len, model_dim)``
"""
input_norm = self.layer_norm(inputs)
context, _ = self.self_attn(input_norm, input_norm, input_norm,
mask=mask)
mask=mask, type="self")
out = self.dropout(context) + inputs
return self.feed_forward(out)


class TransformerEncoder(EncoderBase):
"""
The Transformer encoder from "Attention is All You Need".
"""The Transformer encoder from "Attention is All You Need"
:cite:`DBLP:journals/corr/VaswaniSPUJGKP17`
.. mermaid::
@@ -74,29 +72,42 @@ class TransformerEncoder(EncoderBase):
heads (int): number of heads
d_ff (int): size of the inner FF layer
dropout (float): dropout parameters
embeddings (:obj:`onmt.modules.Embeddings`):
embeddings (onmt.modules.Embeddings):
embeddings to use, should have positional encodings
Returns:
(`FloatTensor`, `FloatTensor`):
(torch.FloatTensor, torch.FloatTensor):
* embeddings `[src_len x batch_size x model_dim]`
* memory_bank `[src_len x batch_size x model_dim]`
* embeddings ``(src_len, batch_size, model_dim)``
* memory_bank ``(src_len, batch_size, model_dim)``
"""

def __init__(self, num_layers, d_model, heads, d_ff,
dropout, embeddings):
def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings,
max_relative_positions):
super(TransformerEncoder, self).__init__()

self.num_layers = num_layers
self.embeddings = embeddings
self.transformer = nn.ModuleList(
[TransformerEncoderLayer(d_model, heads, d_ff, dropout)
for _ in range(num_layers)])
self.layer_norm = onmt.modules.LayerNorm(d_model)
[TransformerEncoderLayer(
d_model, heads, d_ff, dropout,
max_relative_positions=max_relative_positions)
for i in range(num_layers)])
self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

@classmethod
def from_opt(cls, opt, embeddings):
"""Alternate constructor."""
return cls(
opt.enc_layers,
opt.enc_rnn_size,
opt.heads,
opt.transformer_ff,
opt.dropout,
embeddings,
opt.max_relative_positions)

def forward(self, src, lengths=None):
""" See :obj:`EncoderBase.forward()`"""
"""See :func:`EncoderBase.forward()`"""
self._check_args(src, lengths)

emb = self.embeddings(src)
@@ -105,11 +116,10 @@ def forward(self, src, lengths=None):
words = src[:, :, 0].transpose(0, 1)
w_batch, w_len = words.size()
padding_idx = self.embeddings.word_padding_idx
mask = words.data.eq(padding_idx).unsqueeze(1) \
.expand(w_batch, w_len, w_len)
mask = words.data.eq(padding_idx).unsqueeze(1) # [B, 1, T]
# Run the forward pass of every layer of the transformer.
for i in range(self.num_layers):
out = self.transformer[i](out, mask)
for layer in self.transformer:
out = layer(out, mask)
out = self.layer_norm(out)

return emb, out.transpose(0, 1).contiguous()
return emb, out.transpose(0, 1).contiguous(), lengths
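
The padding mask is now kept at (batch, 1, src_len) and broadcast inside attention rather than expanded to (batch, src_len, src_len). A toy illustration of how such a mask silences padded keys (not the library's attention code; padding_idx and the scores are made up):

import torch

padding_idx = 1
words = torch.tensor([[4, 5, 1, 1],
                      [7, 8, 9, 1]])        # (batch, src_len) word ids
mask = words.eq(padding_idx).unsqueeze(1)   # (batch, 1, src_len)

scores = torch.randn(2, 4, 4)               # toy attention scores (batch, query, key)
scores = scores.masked_fill(mask, float("-inf"))
attn = torch.softmax(scores, dim=-1)         # padded keys receive zero attention weight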
37 changes: 19 additions & 18 deletions onmt/inputters/__init__.py
@@ -3,23 +3,24 @@
Inputters implement the logic of transforming raw data to vectorized inputs,
e.g., from a line of text to a sequence of embeddings.
"""
from onmt.inputters.inputter import collect_feature_vocabs, make_features, \
collect_features, get_num_features, \
load_fields_from_vocab, get_fields, \
save_fields_to_vocab, build_dataset, \
build_vocab, merge_vocabs, OrderedIterator
from onmt.inputters.dataset_base import DatasetBase, PAD_WORD, BOS_WORD, \
EOS_WORD, UNK
from onmt.inputters.text_dataset import TextDataset, ShardedTextCorpusIterator
from onmt.inputters.image_dataset import ImageDataset
from onmt.inputters.audio_dataset import AudioDataset
from onmt.inputters.inputter import \
load_old_vocab, get_fields, OrderedIterator, \
build_vocab, old_style_vocab, filter_example
from onmt.inputters.dataset_base import Dataset
from onmt.inputters.text_dataset import text_sort_key, TextDataReader
from onmt.inputters.image_dataset import img_sort_key, ImageDataReader
from onmt.inputters.audio_dataset import audio_sort_key, AudioDataReader
from onmt.inputters.datareader_base import DataReaderBase


__all__ = ['PAD_WORD', 'BOS_WORD', 'EOS_WORD', 'UNK', 'DatasetBase',
'collect_feature_vocabs', 'make_features',
'collect_features', 'get_num_features',
'load_fields_from_vocab', 'get_fields',
'save_fields_to_vocab', 'build_dataset',
'build_vocab', 'merge_vocabs', 'OrderedIterator',
'TextDataset', 'ImageDataset', 'AudioDataset',
'ShardedTextCorpusIterator']
str2reader = {
"text": TextDataReader, "img": ImageDataReader, "audio": AudioDataReader}
str2sortkey = {
'text': text_sort_key, 'img': img_sort_key, 'audio': audio_sort_key}


__all__ = ['Dataset', 'load_old_vocab', 'get_fields', 'DataReaderBase',
'filter_example', 'old_style_vocab',
'build_vocab', 'OrderedIterator',
'text_sort_key', 'img_sort_key', 'audio_sort_key',
'TextDataReader', 'ImageDataReader', 'AudioDataReader']
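
With the rewritten module, readers and sort keys are looked up by data type instead of importing per-type dataset classes. A minimal usage sketch (how the selected reader and sort key are consumed downstream is outside this diff):

from onmt.inputters import str2reader, str2sortkey

data_type = "text"                     # one of "text", "img", "audio"
reader_cls = str2reader[data_type]     # TextDataReader
sort_key = str2sortkey[data_type]      # text_sort_key
# reader_cls and sort_key are then handed to the dataset-building code
# (those call sites are not part of this file's diff).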