diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2b19e98
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,16 @@
+**.DS_Store
+**/.DS_Store
+
+*.pyc
+*.pyo
+
+__pycache__/
+
+data/*
+serialization_dirs/
+nohup_logs/
+.env/
+
+*.ipynb
+*.vscode
+.ipynb_checkpoints
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..a318351
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "SentEval"]
+	path = SentEval
+	url = https://github.com/facebookresearch/SentEval
diff --git a/HMTL_architecture.png b/HMTL_architecture.png
new file mode 100644
index 0000000..6eda142
Binary files /dev/null and b/HMTL_architecture.png differ
diff --git a/README.md b/README.md
index ba7898f..8480ce6 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,61 @@
-# hmtl
-HMTL: Hierarchical Multi-Task Learning
+# HMTL (Hierarchical Multi-Task Learning model) 
+
+A Hierarchical Multi-Task Approach for Learning Embeddings from Semantic Tasks\
+Victor SANH, Thomas WOLF, Sebastian RUDER\
+AAAI 2019
+
+<img src="https://github.com/huggingface/jointmodelMD/blob/master/HMTL_architecture.png" alt="HMTL Architecture" width="350"/>
+
+## About
+
+HMTL is a Hierarchical Multi-Task Learning model which combines a set of four carefully selected semantic tasks (namely Named Entity Recognition, Entity Mention Detection, Relation Extraction and Coreference Resolution). The model achieves state-of-the-art results on Named Entity Recognition, Entity Mention Detection and Relation Extraction. Using [SentEval](https://github.com/facebookresearch/SentEval), we show that as we move from the bottom to the top layers of the model, the model tends to learn more complex semantic representations.
+
+For more details, we refer to our AAAI paper (LINK Arxiv).
+
+We release here the code for _training_, _fine-tuning_ and _evaluating_ HMTL. We hope that this code will be useful for building your own Multi-Task models (hierarchical or not). The code is written in __Python__ and powered by __PyTorch__.
+
+## Dependencies and installation
+
+The main dependencies are:
+- [AllenNLP](https://github.com/allenai/allennlp)
+- [PyTorch](https://pytorch.org/)
+- [SentEval](https://github.com/facebookresearch/SentEval) (only for evaluating the embeddings)
+
+The code works with __Python 3.6__. A stable version of the dependencies is listed in `requirements.txt`.
+
+You can quickly set up a working environment by calling the script `./script/machine_setup.sh`. It installs Python 3.6, creates a clean virtual environment, and installs all the required dependencies (listed in `requirements.txt`). Please adapt the script depending on your needs.
+
+## Example usage
+
+We base our implementation on the [AllenNLP library](https://github.com/allenai/allennlp). For an introduction to this library, you should check [these tutorials](https://allennlp.org/tutorials).
+
+An experiment is described in a _json_ configuration file (see `configs/*.json` for examples). The configuration file mainly describes the datasets to load, the model to create along with all the hyper-parameters of the model. 
+
+Once you have set up your configuration file (and defined custom classes if needed), you can simply launch a training with the following command and arguments:
+
+```bash
+python train.py --config_file_path configs/hmtl_coref_conll.json --serialization_dir my_first_training
+```
+
+Once the training has started, you can simply follow the training in the terminal or open a [Tensorboard](https://www.tensorflow.org/guide/summaries_and_tensorboard):
+
+```bash
+tensorboard --logdir my_first_training/log
+```
+
+## Evaluating the embeddings with SentEval
+
+We used [SentEval](https://github.com/facebookresearch/SentEval) to assess the linguistic properties learned by the model. `hmtl_senteval.py` gives an example of how we can create an interface between SentEval and HMTL.
+
+## Data
+
+To download the pre-trained embeddings we used in HMTL, you can simply call the script `./script/data_setup.sh`.
+
+We do not attach the datasets used to train HMTL for licensing reasons, but we invite you to collect them yourself: [OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19), [CoNLL2003](https://www.clips.uantwerpen.be/conll2003/ner/), and [ACE2005](https://catalog.ldc.upenn.edu/LDC2006T06). The configuration files expect the datasets to be placed in the `data/` folder.
+
+## References
+
+```
+@article{
+}
+```
diff --git a/configs/coref_ace.json b/configs/coref_ace.json
new file mode 100644
index 0000000..d3e5326
--- /dev/null
+++ b/configs/coref_ace.json
@@ -0,0 +1,150 @@
+{
+	"task_coref":{
+		"task_description":{
+			"task_name": "coref",
+			"validation_metric_name": "coref_f1",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "coref_ace",
+				"max_span_width": 8,
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+		
+		
+			"train_data_path":"./data/ace2005/single_file_train_rahman.gold_conll",
+			"validation_data_path": "./data/ace2005/single_file_dev_rahman.gold_conll",
+			"test_data_path": "./data/ace2005/single_file_test_rahman.gold_conll",
+
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"model": {
+		"type": "coref_custom",
+		
+		"text_field_embedder": {
+			"token_embedders": {
+				"tokens": {
+					"type": "embedding",
+					"pretrained_file": "./data/glove/glove.6B.100d.txt.gz",
+					"embedding_dim": 100,
+					"trainable": true
+				},
+				"elmo": {
+					"type": "elmo_token_embedder",
+					"options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json",
+					"weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5",
+					"do_layer_norm": false,
+					"dropout": 0,
+					"requires_grad": false
+				},
+				"token_characters": {
+					"type": "character_encoding",
+					"embedding": {
+						"embedding_dim": 16
+					},
+					"encoder": {
+						"type": "cnn",
+						"embedding_dim": 16,
+						"num_filters": 64,
+						"ngram_filter_sizes": [3]
+					},
+					"dropout": 0.1
+				}
+			}
+		},
+		
+		"coref": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1188,
+				"hidden_size": 200,
+				"num_layers": 1,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"mention_feedforward": {
+					"input_dim": 2008,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"antecedent_feedforward": {
+					"input_dim": 6044,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"initializer": [
+					[".*linear_layers.*weight", {"type": "xavier_normal"}],
+					[".*scorer._module.weight", {"type": "xavier_normal"}],
+					["_distance_embedding.weight", {"type": "xavier_normal"}],
+					["_span_width_embedding.weight", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_hh.*", {"type": "orthogonal"}]
+				],
+				"lexical_dropout": 0.5,
+				"feature_size": 20,
+				"max_span_width": 8,
+				"spans_per_word": 0.4,
+				"max_antecedents": 70,
+				"eval_on_gold_mentions": false
+			}
+		}
+	},
+	
+	"iterators": {
+		"iterator": {
+			"type": "basic",
+			"batch_size": 32
+		},
+		"iterator_coref": {
+			"type": "bucket",
+			"sorting_keys": [["text", "num_tokens"]],
+			"padding_noise": 0.0,
+			"batch_size": 1
+		  }
+	},
+	
+	"multi_task_trainer": {
+		"type": "sampler_multi_task_trainer",
+		"sampling_method": "proportional",
+		"patience": 10,
+		"num_epochs": 100,
+		"min_lr": "1e-7",
+		"grad_norm": 5.0,
+		"grad_clipping": 10.0,
+		"cuda_device": 0,
+		"optimizer": {
+			"type": "adam", 
+			"lr": 0.001
+		},
+		"scheduler": {
+			"type": "reduce_on_plateau", 
+			"mode": "min", 
+			"factor": 0.5,
+			"patience": 5, 
+			"threshold": 0.0001,
+			"threshold_mode": "abs", 
+			"verbose": true
+		}
+	}
+}
\ No newline at end of file
diff --git a/configs/coref_conll.json b/configs/coref_conll.json
new file mode 100644
index 0000000..d538bac
--- /dev/null
+++ b/configs/coref_conll.json
@@ -0,0 +1,150 @@
+{
+	"task_coref":{
+		"task_description":{
+			"task_name": "coref",
+			"validation_metric_name": "coref_f1",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "coref",
+				"max_span_width": 8,
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+		
+		
+			"train_data_path":"./data/conll-2012_single_file/train.english.v4_gold_conll",
+			"validation_data_path": "./data/conll-2012_single_file/dev.english.v4_gold_conll",
+			"test_data_path": "./data/conll-2012_single_file/test.english.v4_gold_conll",
+
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"model": {
+		"type": "coref_custom",
+		
+		"text_field_embedder": {
+			"token_embedders": {
+				"tokens": {
+					"type": "embedding",
+					"pretrained_file": "./data/glove/glove.6B.100d.txt.gz",
+					"embedding_dim": 100,
+					"trainable": true
+				},
+				"elmo": {
+					"type": "elmo_token_embedder",
+					"options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json",
+					"weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5",
+					"do_layer_norm": false,
+					"dropout": 0,
+					"requires_grad": false
+				},
+				"token_characters": {
+					"type": "character_encoding",
+					"embedding": {
+						"embedding_dim": 16
+					},
+					"encoder": {
+						"type": "cnn",
+						"embedding_dim": 16,
+						"num_filters": 64,
+						"ngram_filter_sizes": [3]
+					},
+					"dropout": 0.1
+				}
+			}
+		},
+		
+		"coref": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1188,
+				"hidden_size": 200,
+				"num_layers": 1,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"mention_feedforward": {
+					"input_dim": 2008,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"antecedent_feedforward": {
+					"input_dim": 6044,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"initializer": [
+					[".*linear_layers.*weight", {"type": "xavier_normal"}],
+					[".*scorer._module.weight", {"type": "xavier_normal"}],
+					["_distance_embedding.weight", {"type": "xavier_normal"}],
+					["_span_width_embedding.weight", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_hh.*", {"type": "orthogonal"}]
+				],
+				"lexical_dropout": 0.5,
+				"feature_size": 20,
+				"max_span_width": 8,
+				"spans_per_word": 0.4,
+				"max_antecedents": 70,
+				"eval_on_gold_mentions": false
+			}
+		}
+	},
+	
+	"iterators": {
+		"iterator": {
+			"type": "basic",
+			"batch_size": 32
+		},
+		"iterator_coref": {
+			"type": "bucket",
+			"sorting_keys": [["text", "num_tokens"]],
+			"padding_noise": 0.0,
+			"batch_size": 1
+		  }
+	},
+	
+	"multi_task_trainer": {
+		"type": "sampler_multi_task_trainer",
+		"sampling_method": "proportional",
+		"patience": 10,
+		"num_epochs": 100,
+		"min_lr": "1e-7",
+		"grad_norm": 5.0,
+		"grad_clipping": 10.0,
+		"cuda_device": 0,
+		"optimizer": {
+			"type": "adam", 
+			"lr": 0.001
+		},
+		"scheduler": {
+			"type": "reduce_on_plateau", 
+			"mode": "min", 
+			"factor": 0.5,
+			"patience": 5, 
+			"threshold": 0.0001,
+			"threshold_mode": "abs", 
+			"verbose": true
+		}
+	}
+}
\ No newline at end of file
diff --git a/configs/emd.json b/configs/emd.json
new file mode 100644
index 0000000..a921499
--- /dev/null
+++ b/configs/emd.json
@@ -0,0 +1,120 @@
+{
+	"task_ner":{
+		"task_description":{
+			"task_name": "ner",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader":{
+				"type": "mention_ace",
+				"label_namespace": "ace_mention_labels",
+				"lazy": false,
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+			
+			
+			"train_data_path": "./data/ace2005/train/",
+			"validation_data_path": "./data/ace2005/dev/",
+			"test_data_path": "./data/ace2005/test/",
+			
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+
+	"model": {
+		"type": "ner",
+		
+		"text_field_embedder": {
+			"token_embedders": {
+				"tokens": {
+					"type": "embedding",
+					"pretrained_file": "./data/glove/glove.6B.100d.txt.gz",
+					"embedding_dim": 100,
+					"trainable": true
+				},
+				"elmo": {
+					"type": "elmo_token_embedder",
+					"options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json",
+					"weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5",
+					"do_layer_norm": false,
+					"dropout": 0,
+					"requires_grad": false
+				},
+				"token_characters": {
+					"type": "character_encoding",
+					"embedding": {
+						"embedding_dim": 16
+					},
+					"encoder": {
+						"type": "cnn",
+						"embedding_dim": 16,
+						"num_filters": 64,
+						"ngram_filter_sizes": [3]
+					},
+					"dropout": 0.1
+				}
+			}
+		},
+		
+		"ner": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1188,
+				"hidden_size": 64,
+				"num_layers": 2,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"label_namespace": "ace_mention_labels",
+				"constraint_type": "BIOUL",
+				"dropout": 0.2 		
+			}
+		}
+	},
+
+	"iterators": {
+		"iterator": {
+			"type": "basic",
+			"batch_size": 32
+		}
+	},
+	
+	"multi_task_trainer": {
+		"type": "sampler_multi_task_trainer",
+		"sampling_method": "proportional",
+		"patience": 10,
+		"num_epochs": 100,
+		"min_lr": "1e-7",
+		"grad_norm": 5.0,
+		"grad_clipping": 10.0,
+		"cuda_device": 0,
+		"optimizer": {
+			"type": "adam", 
+			"lr": 0.001
+		},
+		"scheduler": {
+			"type": "reduce_on_plateau", 
+			"mode": "min", 
+			"factor": 0.5,
+			"patience": 5, 
+			"threshold": 0.0001,
+			"threshold_mode": "abs", 
+			"verbose": true
+		}
+	}
+}
diff --git a/configs/emd_coref_ace.json b/configs/emd_coref_ace.json
new file mode 100644
index 0000000..c5b829f
--- /dev/null
+++ b/configs/emd_coref_ace.json
@@ -0,0 +1,200 @@
+{
+	"task_emd":{
+		"task_description":{
+			"task_name": "emd",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "mention_ace",
+				"label_namespace": "ace_mention_labels",
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+			
+			"train_data_path": "./data/ace2005/train/",
+			"validation_data_path": "./data/ace2005/dev/",
+			"test_data_path": "./data/ace2005/test/",
+			
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"task_coref":{
+		"task_description":{
+			"task_name": "coref",
+			"validation_metric_name": "coref_f1",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "coref_ace",
+				"max_span_width": 8,
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+		
+		
+			"train_data_path":"./data/ace2005/single_file_train.gold_conll",
+			"validation_data_path": "./data/ace2005/single_file_dev.gold_conll",
+			"test_data_path": "./data/ace2005/single_file_test.gold_conll",
+
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"model": {
+		"type": "emd_coref",
+		
+		"text_field_embedder": {
+			"token_embedders": {
+				"tokens": {
+					"type": "embedding",
+					"pretrained_file": "./data/glove/glove.6B.100d.txt.gz",
+					"embedding_dim": 100,
+					"trainable": true
+				},
+				"elmo": {
+					"type": "elmo_token_embedder",
+					"options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json",
+					"weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5",
+					"do_layer_norm": false,
+					"dropout": 0,
+					"requires_grad": false
+				},
+				"token_characters": {
+					"type": "character_encoding",
+					"embedding": {
+						"embedding_dim": 16
+					},
+					"encoder": {
+						"type": "cnn",
+						"embedding_dim": 16,
+						"num_filters": 64,
+						"ngram_filter_sizes": [3]
+					},
+					"dropout": 0.1
+				}
+			}
+		},
+		
+		"emd": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1188,
+				"hidden_size": 64,
+				"num_layers": 2,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"label_namespace": "ace_mention_labels",
+				"constraint_type": "BIOUL",
+				"dropout": 0.2 
+			}
+		},
+		
+		"coref": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1316,
+				"hidden_size": 200,
+				"num_layers": 1,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"mention_feedforward": {
+					"input_dim": 2136,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"antecedent_feedforward": {
+					"input_dim": 6428,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"initializer": [
+					[".*linear_layers.*weight", {"type": "xavier_normal"}],
+					[".*scorer._module.weight", {"type": "xavier_normal"}],
+					["_distance_embedding.weight", {"type": "xavier_normal"}],
+					["_span_width_embedding.weight", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_hh.*", {"type": "orthogonal"}]
+				],
+				"lexical_dropout": 0.5,
+				"feature_size": 20,
+				"max_span_width": 8,
+				"spans_per_word": 0.4,
+				"max_antecedents": 70,
+				"eval_on_gold_mentions": false
+			}
+		}
+	},
+
+	"iterators": {
+		"iterator": {
+			"type": "basic",
+			"batch_size": 32
+		},
+		"iterator_coref": {
+			"type": "bucket",
+			"sorting_keys": [["text", "num_tokens"]],
+			"padding_noise": 0.0,
+			"batch_size": 1
+		}
+	},
+	
+	"multi_task_trainer": {
+		"type": "sampler_multi_task_trainer",
+		"sampling_method": "proportional",
+		"patience": 10,
+		"num_epochs": 100,
+		"min_lr": "1e-7",
+		"grad_norm": 5.0,
+		"grad_clipping": 10.0,
+		"cuda_device": 0,
+		"optimizer": {
+			"type": "adam", 
+			"lr": 0.001
+		},
+		"scheduler": {
+			"type": "reduce_on_plateau", 
+			"mode": "min", 
+			"factor": 0.5,
+			"patience": 5, 
+			"threshold": 0.0001,
+			"threshold_mode": "abs", 
+			"verbose": true
+		}
+	}
+}
\ No newline at end of file
diff --git a/configs/emd_relation.json b/configs/emd_relation.json
new file mode 100644
index 0000000..ed3fa90
--- /dev/null
+++ b/configs/emd_relation.json
@@ -0,0 +1,173 @@
+{
+	"task_emd":{
+		"task_description":{
+			"task_name": "emd",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "mention_ace",
+				"label_namespace": "ace_mention_labels",
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+			
+			"train_data_path": "./data/ace2005/train/",
+			"validation_data_path": "./data/ace2005/dev/",
+			"test_data_path": "./data/ace2005/test/",
+			
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"task_relation":{
+		"task_description":{
+			"task_name": "relation",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "relation_ace",
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+		
+		
+			"train_data_path": "./data/ace2005/train/",
+			"validation_data_path": "./data/ace2005/dev/",
+			"test_data_path": "./data/ace2005/test/",
+
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},	
+	
+	"model": {
+		"type": "emd_relation",
+		
+		"text_field_embedder": {
+			"token_embedders": {
+				"tokens": {
+					"type": "embedding",
+					"pretrained_file": "./data/glove/glove.6B.100d.txt.gz",
+					"embedding_dim": 100,
+					"trainable": true
+				},
+				"elmo": {
+					"type": "elmo_token_embedder",
+					"options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json",
+					"weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5",
+					"do_layer_norm": false,
+					"dropout": 0,
+					"requires_grad": false
+				},
+				"token_characters": {
+					"type": "character_encoding",
+					"embedding": {
+						"embedding_dim": 16
+					},
+					"encoder": {
+						"type": "cnn",
+						"embedding_dim": 16,
+						"num_filters": 64,
+						"ngram_filter_sizes": [3]
+					},
+					"dropout": 0.1
+				}
+			}
+		},
+		
+		"emd": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1188,
+				"hidden_size": 64,
+				"num_layers": 2,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"label_namespace": "ace_mention_labels",
+				"constraint_type": "BIOUL",
+				"dropout": 0.2 
+			}
+		},
+		
+		"relation": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1316,
+				"hidden_size": 64,
+				"num_layers": 3,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"d": 64,
+				"l": 64,
+				"n_classes": 6,
+				"activation": "relu"
+			}
+		}
+	},
+	
+	"iterators": {
+		"iterator": {
+			"type": "basic",
+			"batch_size": 32
+		},
+		"iterator_relation": {
+			"type": "basic",
+			"batch_size": 4
+		}
+	},
+
+	"multi_task_trainer": {
+		"type": "sampler_multi_task_trainer",
+		"sampling_method": "proportional",
+		"patience": 10,
+		"num_epochs": 100,
+		"min_lr": "1e-7",
+		"grad_norm": 5.0,
+		"grad_clipping": 10.0,
+		"cuda_device": 0,
+		"optimizer": {
+			"type": "adam", 
+			"lr": 0.001
+		},
+		"scheduler": {
+			"type": "reduce_on_plateau", 
+			"mode": "min", 
+			"factor": 0.5,
+			"patience": 5, 
+			"threshold": 0.0001,
+			"threshold_mode": "abs", 
+			"verbose": true
+		}
+	}
+}
\ No newline at end of file
diff --git a/configs/hmtl_coref_ace.json b/configs/hmtl_coref_ace.json
new file mode 100644
index 0000000..57a6fdf
--- /dev/null
+++ b/configs/hmtl_coref_ace.json
@@ -0,0 +1,307 @@
+{
+	"task_ner":{
+		"task_description":{
+			"task_name": "ner",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader":{
+				"type": "ner_ontonotes",
+				"label_namespace": "ontonotes_ner_labels",
+				"coding_scheme": "BIOUL",
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+			
+			
+			"train_data_path": "./data/conll-2012/v4/data/train/",
+			"validation_data_path": "./data/conll-2012/v4/data/development/",
+			"test_data_path": "./data/conll-2012/v4/data/test/",
+			
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"task_emd":{
+		"task_description":{
+			"task_name": "emd",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "mention_ace",
+				"label_namespace": "ace_mention_labels",
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+			
+			"train_data_path": "./data/ace2005/train/",
+			"validation_data_path": "./data/ace2005/dev/",
+			"test_data_path": "./data/ace2005/test/",
+			
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"task_relation":{
+		"task_description":{
+			"task_name": "relation",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "relation_ace",
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+		
+		
+			"train_data_path": "./data/ace2005/train/",
+			"validation_data_path": "./data/ace2005/dev/",
+			"test_data_path": "./data/ace2005/test/",
+
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"task_coref":{
+		"task_description":{
+			"task_name": "coref",
+			"validation_metric_name": "coref_f1",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "coref_ace",
+				"max_span_width": 8,
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+		
+		
+			"train_data_path":"./data/ace2005/single_file_train_rahman.gold_conll",
+			"validation_data_path": "./data/ace2005/single_file_dev_rahman.gold_conll",
+			"test_data_path": "./data/ace2005/single_file_test_rahman.gold_conll",
+
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"model": {
+		"type": "hmtl",
+		
+		"text_field_embedder": {
+			"token_embedders": {
+				"tokens": {
+					"type": "embedding",
+					"pretrained_file": "./data/glove/glove.6B.100d.txt.gz",
+					"embedding_dim": 100,
+					"trainable": true
+				},
+				"elmo": {
+					"type": "elmo_token_embedder",
+					"options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json",
+					"weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5",
+					"do_layer_norm": false,
+					"dropout": 0,
+					"requires_grad": false
+				},
+				"token_characters": {
+					"type": "character_encoding",
+					"embedding": {
+						"embedding_dim": 16
+					},
+					"encoder": {
+						"type": "cnn",
+						"embedding_dim": 16,
+						"num_filters": 64,
+						"ngram_filter_sizes": [3]
+					},
+					"dropout": 0.1
+				}
+			}
+		},
+		
+		"ner": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1188,
+				"hidden_size": 64,
+				"num_layers": 2,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"label_namespace": "ontonotes_ner_labels",
+				"constraint_type": "BIOUL",
+				"dropout": 0.2 		
+			}
+		},
+		
+		"emd": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1316,
+				"hidden_size": 64,
+				"num_layers": 2,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"label_namespace": "ace_mention_labels",
+				"constraint_type": "BIOUL",
+				"dropout": 0.2 
+			}
+		},
+		
+		"relation": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1316,
+				"hidden_size": 64,
+				"num_layers": 3,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"d": 64,
+				"l": 64,
+				"n_classes": 6,
+				"activation": "relu"
+			}
+		},
+		
+		"coref": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1316,
+				"hidden_size": 200,
+				"num_layers": 1,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"mention_feedforward": {
+					"input_dim": 2136,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"antecedent_feedforward": {
+					"input_dim": 6428,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"initializer": [
+					[".*linear_layers.*weight", {"type": "xavier_normal"}],
+					[".*scorer._module.weight", {"type": "xavier_normal"}],
+					["_distance_embedding.weight", {"type": "xavier_normal"}],
+					["_span_width_embedding.weight", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_hh.*", {"type": "orthogonal"}]
+				],
+				"lexical_dropout": 0.5,
+				"feature_size": 20,
+				"max_span_width": 8,
+				"spans_per_word": 0.4,
+				"max_antecedents": 70,
+				"eval_on_gold_mentions": false
+			}
+		}
+	},
+	
+	"iterators": {
+		"iterator": {
+			"type": "basic",
+			"batch_size": 32
+		},
+		"iterator_relation": {
+			"type": "basic",
+			"batch_size": 4
+		},
+		"iterator_coref": {
+			"type": "bucket",
+			"sorting_keys": [["text", "num_tokens"]],
+			"padding_noise": 0.0,
+			"batch_size": 1
+		  }
+	},
+	
+	"multi_task_trainer": {
+		"type": "sampler_multi_task_trainer",
+		"sampling_method": "proportional",
+		"patience": 10,
+		"num_epochs": 100,
+		"min_lr": "1e-7",
+		"grad_norm": 5.0,
+		"grad_clipping": 10.0,
+		"cuda_device": 0,
+		"optimizer": {
+			"type": "adam", 
+			"lr": 0.001
+		},
+		"scheduler": {
+			"type": "reduce_on_plateau", 
+			"mode": "min", 
+			"factor": 0.5,
+			"patience": 5, 
+			"threshold": 0.0001,
+			"threshold_mode": "abs", 
+			"verbose": true
+		}
+	}
+}
\ No newline at end of file
diff --git a/configs/hmtl_coref_conll.json b/configs/hmtl_coref_conll.json
new file mode 100644
index 0000000..8aa732b
--- /dev/null
+++ b/configs/hmtl_coref_conll.json
@@ -0,0 +1,307 @@
+{
+	"task_ner":{
+		"task_description":{
+			"task_name": "ner",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader":{
+				"type": "ner_ontonotes",
+				"label_namespace": "ontonotes_ner_labels",
+				"coding_scheme": "BIOUL",
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+			
+			
+			"train_data_path": "./data/conll-2012/v4/data/train/",
+			"validation_data_path": "./data/conll-2012/v4/data/development/",
+			"test_data_path": "./data/conll-2012/v4/data/test/",
+			
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"task_emd":{
+		"task_description":{
+			"task_name": "emd",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "mention_ace",
+				"label_namespace": "ace_mention_labels",
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+			
+			"train_data_path": "./data/ace2005/train/",
+			"validation_data_path": "./data/ace2005/dev/",
+			"test_data_path": "./data/ace2005/test/",
+			
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"task_relation":{
+		"task_description":{
+			"task_name": "relation",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "relation_ace",
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+		
+		
+			"train_data_path": "./data/ace2005/train/",
+			"validation_data_path": "./data/ace2005/dev/",
+			"test_data_path": "./data/ace2005/test/",
+
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"task_coref":{
+		"task_description":{
+			"task_name": "coref",
+			"validation_metric_name": "coref_f1",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "coref",
+				"max_span_width": 8,
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+		
+		
+			"train_data_path":"./data/conll-2012_single_file/train.english.v4_gold_conll",
+			"validation_data_path": "./data/conll-2012_single_file/dev.english.v4_gold_conll",
+			"test_data_path": "./data/conll-2012_single_file/test.english.v4_gold_conll",
+
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"model": {
+		"type": "hmtl",
+		
+		"text_field_embedder": {
+			"token_embedders": {
+				"tokens": {
+					"type": "embedding",
+					"pretrained_file": "./data/glove/glove.6B.100d.txt.gz",
+					"embedding_dim": 100,
+					"trainable": true
+				},
+				"elmo": {
+					"type": "elmo_token_embedder",
+					"options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json",
+					"weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5",
+					"do_layer_norm": false,
+					"dropout": 0,
+					"requires_grad": false
+				},
+				"token_characters": {
+					"type": "character_encoding",
+					"embedding": {
+						"embedding_dim": 16
+					},
+					"encoder": {
+						"type": "cnn",
+						"embedding_dim": 16,
+						"num_filters": 64,
+						"ngram_filter_sizes": [3]
+					},
+					"dropout": 0.1
+				}
+			}
+		},
+		
+		"ner": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1188,
+				"hidden_size": 64,
+				"num_layers": 2,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"label_namespace": "ontonotes_ner_labels",
+				"constraint_type": "BIOUL",
+				"dropout": 0.2 		
+			}
+		},
+		
+		"emd": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1316,
+				"hidden_size": 64,
+				"num_layers": 2,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"label_namespace": "ace_mention_labels",
+				"constraint_type": "BIOUL",
+				"dropout": 0.2 
+			}
+		},
+		
+		"relation": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1316,
+				"hidden_size": 64,
+				"num_layers": 3,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"d": 64,
+				"l": 64,
+				"n_classes": 6,
+				"activation": "relu"
+			}
+		},
+		
+		"coref": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1316,
+				"hidden_size": 200,
+				"num_layers": 1,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"mention_feedforward": {
+					"input_dim": 2136,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"antecedent_feedforward": {
+					"input_dim": 6428,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"initializer": [
+					[".*linear_layers.*weight", {"type": "xavier_normal"}],
+					[".*scorer._module.weight", {"type": "xavier_normal"}],
+					["_distance_embedding.weight", {"type": "xavier_normal"}],
+					["_span_width_embedding.weight", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_hh.*", {"type": "orthogonal"}]
+				],
+				"lexical_dropout": 0.5,
+				"feature_size": 20,
+				"max_span_width": 8,
+				"spans_per_word": 0.4,
+				"max_antecedents": 70,
+				"eval_on_gold_mentions": false
+			}
+		}
+	},
+	
+	"iterators": {
+		"iterator": {
+			"type": "basic",
+			"batch_size": 32
+		},
+		"iterator_relation": {
+			"type": "basic",
+			"batch_size": 4
+		},
+		"iterator_coref": {
+			"type": "bucket",
+			"sorting_keys": [["text", "num_tokens"]],
+			"padding_noise": 0.0,
+			"batch_size": 1
+		  }
+	},
+	
+	"multi_task_trainer": {
+		"type": "sampler_multi_task_trainer",
+		"sampling_method": "proportional",
+		"patience": 10,
+		"num_epochs": 100,
+		"min_lr": "1e-7",
+		"grad_norm": 5.0,
+		"grad_clipping": 10.0,
+		"cuda_device": 0,
+		"optimizer": {
+			"type": "adam", 
+			"lr": 0.001
+		},
+		"scheduler": {
+			"type": "reduce_on_plateau", 
+			"mode": "min", 
+			"factor": 0.5,
+			"patience": 5, 
+			"threshold": 0.0001,
+			"threshold_mode": "abs", 
+			"verbose": true
+		}
+	}
+}
\ No newline at end of file
diff --git a/configs/ner.json b/configs/ner.json
new file mode 100644
index 0000000..d1655ea
--- /dev/null
+++ b/configs/ner.json
@@ -0,0 +1,120 @@
+{
+	"task_ner":{
+		"task_description":{
+			"task_name": "ner",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader":{
+				"type": "ner_ontonotes",
+				"label_namespace": "ontonotes_ner_labels",
+				"coding_scheme": "BIOUL",
+				"lazy": false,
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+			
+			
+			"train_data_path": "./data/conll-2012/v4/data/train/",
+			"validation_data_path": "./data/conll-2012/v4/data/development/",
+			"test_data_path": "./data/conll-2012/v4/data/test/",
+			
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	"model": {
+		"type": "ner",
+		
+		"text_field_embedder": {
+			"token_embedders": {
+				"tokens": {
+					"type": "embedding",
+					"pretrained_file": "./data/glove/glove.6B.100d.txt.gz",
+					"embedding_dim": 100,
+					"trainable": true
+				},
+				"elmo": {
+					"type": "elmo_token_embedder",
+					"options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json",
+					"weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5",
+					"do_layer_norm": false,
+					"dropout": 0,
+					"requires_grad": false
+				},
+				"token_characters": {
+					"type": "character_encoding",
+					"embedding": {
+						"embedding_dim": 16
+					},
+					"encoder": {
+						"type": "cnn",
+						"embedding_dim": 16,
+						"num_filters": 64,
+						"ngram_filter_sizes": [3]
+					},
+					"dropout": 0.1
+				}
+			}
+		},
+		
+		"ner": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1188,
+				"hidden_size": 64,
+				"num_layers": 2,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"label_namespace": "ontonotes_ner_labels",
+				"constraint_type": "BIOUL",
+				"dropout": 0.2 		
+			}
+		}
+	},
+
+	"iterators": {
+		"iterator": {
+			"type": "basic",
+			"batch_size": 32
+		}
+	},
+	
+	"multi_task_trainer": {
+		"type": "sampler_multi_task_trainer",
+		"sampling_method": "proportional",
+		"patience": 10,
+		"num_epochs": 100,
+		"min_lr": "1e-7",
+		"grad_norm": 5.0,
+		"grad_clipping": 10.0,
+		"cuda_device": 0,
+		"optimizer": {
+			"type": "adam", 
+			"lr": 0.001
+		},
+		"scheduler": {
+			"type": "reduce_on_plateau", 
+			"mode": "min", 
+			"factor": 0.5,
+			"patience": 5, 
+			"threshold": 0.0001,
+			"threshold_mode": "abs", 
+			"verbose": true
+		}
+	}
+}
diff --git a/configs/ner_emd.json b/configs/ner_emd.json
new file mode 100644
index 0000000..b5ee471
--- /dev/null
+++ b/configs/ner_emd.json
@@ -0,0 +1,172 @@
+{
+	"task_ner":{
+		"task_description":{
+			"task_name": "ner",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader":{
+				"type": "ner_ontonotes",
+				"label_namespace": "ontonotes_ner_labels",
+				"coding_scheme": "BIOUL",
+				"lazy": false,
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+			
+			
+			"train_data_path": "./data/conll-2012/v4/data/train/",
+			"validation_data_path": "./data/conll-2012/v4/data/development/",
+			"test_data_path": "./data/conll-2012/v4/data/test/",
+			
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"task_emd":{
+		"task_description":{
+			"task_name": "emd",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "mention_ace",
+				"label_namespace": "ace_mention_labels",
+				"lazy": false,
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+			
+			"train_data_path": "./data/ace2005/train",
+			"validation_data_path": "./data/ace2005/dev",
+			"test_data_path": "./data/ace2005/test",
+			
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"model": {
+		"type": "ner_emd",
+		
+		"text_field_embedder": {
+			"token_embedders": {
+				"tokens": {
+					"type": "embedding",
+					"pretrained_file": "./data/glove/glove.6B.100d.txt.gz",
+					"embedding_dim": 100,
+					"trainable": true
+				},
+				"elmo": {
+					"type": "elmo_token_embedder",
+					"options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json",
+					"weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5",
+					"do_layer_norm": false,
+					"dropout": 0,
+					"requires_grad": false
+				},
+				"token_characters": {
+					"type": "character_encoding",
+					"embedding": {
+						"embedding_dim": 16
+					},
+					"encoder": {
+						"type": "cnn",
+						"embedding_dim": 16,
+						"num_filters": 64,
+						"ngram_filter_sizes": [3]
+					},
+					"dropout": 0.1
+				}
+			}
+		},
+		
+		"ner": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1188,
+				"hidden_size": 64,
+				"num_layers": 2,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"label_namespace": "ontonotes_ner_labels",
+				"constraint_type": "BIOUL",
+				"dropout": 0.2 		
+			}
+		},
+		
+		"emd": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1316,
+				"hidden_size": 64,
+				"num_layers": 2,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"label_namespace": "ace_mention_labels",
+				"constraint_type": "BIOUL",
+				"dropout": 0.2 
+			}
+		}
+	},
+
+	"iterators": {
+		"iterator": {
+			"type": "basic",
+			"batch_size": 32
+		}
+	},
+	
+	"multi_task_trainer": {
+		"type": "sampler_multi_task_trainer",
+		"sampling_method": "proportional",
+		"patience": 10,
+		"num_epochs": 100,
+		"min_lr": "1e-7",
+		"grad_norm": 5.0,
+		"grad_clipping": 10.0,
+		"cuda_device": 0,
+		"optimizer": {
+			"type": "adam", 
+			"lr": 0.001
+		},
+		"scheduler": {
+			"type": "reduce_on_plateau", 
+			"mode": "min", 
+			"factor": 0.5,
+			"patience": 5, 
+			"threshold": 0.0001,
+			"threshold_mode": "abs", 
+			"verbose": true
+		}
+	}
+}
\ No newline at end of file
diff --git a/configs/ner_emd_coref_ace.json b/configs/ner_emd_coref_ace.json
new file mode 100644
index 0000000..b47df4f
--- /dev/null
+++ b/configs/ner_emd_coref_ace.json
@@ -0,0 +1,252 @@
+{
+	"task_ner":{
+		"task_description":{
+			"task_name": "ner",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader":{
+				"type": "ner_ontonotes",
+				"label_namespace": "ontonotes_ner_labels",
+				"coding_scheme": "BIOUL",
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+			
+			
+			"train_data_path": "./data/conll-2012/v4/data/train/",
+			"validation_data_path": "./data/conll-2012/v4/data/development/",
+			"test_data_path": "./data/conll-2012/v4/data/test/",
+			
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"task_emd":{
+		"task_description":{
+			"task_name": "emd",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "mention_ace",
+				"label_namespace": "ace_mention_labels",
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+			
+			"train_data_path": "./data/ace2005/train/",
+			"validation_data_path": "./data/ace2005/dev/",
+			"test_data_path": "./data/ace2005/test/",
+			
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"task_coref":{
+		"task_description":{
+			"task_name": "coref",
+			"validation_metric_name": "coref_f1",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "coref_ace",
+				"max_span_width": 8,
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+		
+		
+			"train_data_path":"./data/ace2005/single_file_train.gold_conll",
+			"validation_data_path": "./data/ace2005/single_file_dev.gold_conll",
+			"test_data_path": "./data/ace2005/single_file_test.gold_conll",
+
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"model": {
+		"type": "ner_emd_coref",
+		
+		"text_field_embedder": {
+			"token_embedders": {
+				"tokens": {
+					"type": "embedding",
+					"pretrained_file": "./data/glove/glove.6B.100d.txt.gz",
+					"embedding_dim": 100,
+					"trainable": true
+				},
+				"elmo": {
+					"type": "elmo_token_embedder",
+					"options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json",
+					"weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5",
+					"do_layer_norm": false,
+					"dropout": 0,
+					"requires_grad": false
+				},
+				"token_characters": {
+					"type": "character_encoding",
+					"embedding": {
+						"embedding_dim": 16
+					},
+					"encoder": {
+						"type": "cnn",
+						"embedding_dim": 16,
+						"num_filters": 64,
+						"ngram_filter_sizes": [3]
+					},
+					"dropout": 0.1
+				}
+			}
+		},
+		
+		"ner": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1188,
+				"hidden_size": 64,
+				"num_layers": 2,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"label_namespace": "ontonotes_ner_labels",
+				"constraint_type": "BIOUL",
+				"dropout": 0.2 		
+			}
+		},
+		
+		"emd": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1316,
+				"hidden_size": 64,
+				"num_layers": 2,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"label_namespace": "ace_mention_labels",
+				"constraint_type": "BIOUL",
+				"dropout": 0.2 
+			}
+		},
+		
+		"coref": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1316,
+				"hidden_size": 200,
+				"num_layers": 1,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"mention_feedforward": {
+					"input_dim": 2136,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"antecedent_feedforward": {
+					"input_dim": 6428,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"initializer": [
+					[".*linear_layers.*weight", {"type": "xavier_normal"}],
+					[".*scorer._module.weight", {"type": "xavier_normal"}],
+					["_distance_embedding.weight", {"type": "xavier_normal"}],
+					["_span_width_embedding.weight", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_hh.*", {"type": "orthogonal"}]
+				],
+				"lexical_dropout": 0.5,
+				"feature_size": 20,
+				"max_span_width": 8,
+				"spans_per_word": 0.4,
+				"max_antecedents": 70,
+				"eval_on_gold_mentions": false
+			}
+		}
+	},
+
+	"iterators": {
+		"iterator": {
+			"type": "basic",
+			"batch_size": 32
+		},
+		"iterator_coref": {
+			"type": "bucket",
+			"sorting_keys": [["text", "num_tokens"]],
+			"padding_noise": 0.0,
+			"batch_size": 1
+		}
+	},
+	
+	"multi_task_trainer": {
+		"type": "sampler_multi_task_trainer",
+		"sampling_method": "proportional",
+		"patience": 10,
+		"num_epochs": 100,
+		"min_lr": "1e-7",
+		"grad_norm": 5.0,
+		"grad_clipping": 10.0,
+		"cuda_device": 0,
+		"optimizer": {
+			"type": "adam", 
+			"lr": 0.001
+		},
+		"scheduler": {
+			"type": "reduce_on_plateau", 
+			"mode": "min", 
+			"factor": 0.5,
+			"patience": 5, 
+			"threshold": 0.0001,
+			"threshold_mode": "abs", 
+			"verbose": true
+		}
+	}
+}
\ No newline at end of file
diff --git a/configs/ner_emd_relation.json b/configs/ner_emd_relation.json
new file mode 100644
index 0000000..08d924c
--- /dev/null
+++ b/configs/ner_emd_relation.json
@@ -0,0 +1,226 @@
+{
+	"task_ner":{
+		"task_description":{
+			"task_name": "ner",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader":{
+				"type": "ner_ontonotes",
+				"label_namespace": "ontonotes_ner_labels",
+				"coding_scheme": "BIOUL",
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+			
+			
+			"train_data_path": "./data/conll-2012/v4/data/train/",
+			"validation_data_path": "./data/conll-2012/v4/data/development/",
+			"test_data_path": "./data/conll-2012/v4/data/test/",
+			
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"task_emd":{
+		"task_description":{
+			"task_name": "emd",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "mention_ace",
+				"label_namespace": "ace_mention_labels",
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+			
+			"train_data_path": "./data/ace2005/train/",
+			"validation_data_path": "./data/ace2005/dev/",
+			"test_data_path": "./data/ace2005/test/",
+			
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+	
+	"task_relation":{
+		"task_description":{
+			"task_name": "relation",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader": {
+				"type": "relation_ace",
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+		
+		
+			"train_data_path": "./data/ace2005/train/",
+			"validation_data_path": "./data/ace2005/dev/",
+			"test_data_path": "./data/ace2005/test/",
+
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},	
+	
+	"model": {
+		"type": "ner_emd_relation",
+		
+		"text_field_embedder": {
+			"token_embedders": {
+				"tokens": {
+					"type": "embedding",
+					"pretrained_file": "./data/glove/glove.6B.100d.txt.gz",
+					"embedding_dim": 100,
+					"trainable": true
+				},
+				"elmo": {
+					"type": "elmo_token_embedder",
+					"options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json",
+					"weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5",
+					"do_layer_norm": false,
+					"dropout": 0,
+					"requires_grad": false
+				},
+				"token_characters": {
+					"type": "character_encoding",
+					"embedding": {
+						"embedding_dim": 16
+					},
+					"encoder": {
+						"type": "cnn",
+						"embedding_dim": 16,
+						"num_filters": 64,
+						"ngram_filter_sizes": [3]
+					},
+					"dropout": 0.1
+				}
+			}
+		},
+		
+		"ner": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1188,
+				"hidden_size": 64,
+				"num_layers": 2,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"label_namespace": "ontonotes_ner_labels",
+				"constraint_type": "BIOUL",
+				"dropout": 0.2 		
+			}
+		},
+		
+		"emd": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1316,
+				"hidden_size": 64,
+				"num_layers": 2,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"label_namespace": "ace_mention_labels",
+				"constraint_type": "BIOUL",
+				"dropout": 0.2 
+			}
+		},
+		
+		"relation": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1316,
+				"hidden_size": 64,
+				"num_layers": 3,
+				"dropout": 0.2
+			},
+			
+			"tagger": {
+				"d": 64,
+				"l": 64,
+				"n_classes": 6,
+				"activation": "relu"
+			}
+		}
+	},
+	
+	"iterators": {
+		"iterator": {
+			"type": "basic",
+			"batch_size": 32
+		},
+		"iterator_relation": {
+			"type": "basic",
+			"batch_size": 4
+		}
+	},
+	
+	"multi_task_trainer": {
+		"type": "sampler_multi_task_trainer",
+		"sampling_method": "proportional",
+		"patience": 10,
+		"num_epochs": 100,
+		"min_lr": "1e-7",
+		"grad_norm": 5.0,
+		"grad_clipping": 10.0,
+		"cuda_device": 0,
+		"optimizer": {
+			"type": "adam", 
+			"lr": 0.001
+		},
+		"scheduler": {
+			"type": "reduce_on_plateau", 
+			"mode": "min", 
+			"factor": 0.5,
+			"patience": 5, 
+			"threshold": 0.0001,
+			"threshold_mode": "abs", 
+			"verbose": true
+		}
+	}
+}
\ No newline at end of file
diff --git a/configs/relation.json b/configs/relation.json
new file mode 100644
index 0000000..570dbf7
--- /dev/null
+++ b/configs/relation.json
@@ -0,0 +1,124 @@
+{	
+	"task_relation":{
+		"task_description":{
+			"task_name": "relation",
+			"validation_metric_name": "f1-measure-overall",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+		
+		"data_params":{
+			"dataset_reader":{
+				"type": "relation_ace",
+				"lazy": false,
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+			
+			
+			"train_data_path": "./data/ace2005/train/",
+			"validation_data_path": "./data/ace2005/dev/",
+			"test_data_path": "./data/ace2005/test/",
+			
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+		
+	"model": {
+		"type": "relation",
+		
+		"text_field_embedder": {
+			"token_embedders": {
+				"tokens": {
+					"type": "embedding",
+					"pretrained_file": "./data/glove/glove.6B.100d.txt.gz",
+					"embedding_dim": 100,
+					"trainable": true
+				},
+				"elmo": {
+					"type": "elmo_token_embedder",
+					"options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json",
+					"weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5",
+					"do_layer_norm": false,
+					"dropout": 0,
+					"requires_grad": false
+				},
+				"token_characters": {
+					"type": "character_encoding",
+					"embedding": {
+						"embedding_dim": 16
+					},
+					"encoder": {
+						"type": "cnn",
+						"embedding_dim": 16,
+						"num_filters": 64,
+						"ngram_filter_sizes": [3]
+					},
+					"dropout": 0.1
+				}
+			}
+		},
+		
+		"relation": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1188,
+				"hidden_size": 64,
+				"num_layers": 3,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"d": 64,
+				"l": 64,
+				"n_classes": 6,
+				"activation": "relu"
+			}
+		}
+	},
+	
+	"iterators": {
+		"iterator": {
+			"type": "basic",
+			"batch_size": 32
+		},
+		"iterator_relation": {
+			"type": "basic",
+			"batch_size": 4
+		}
+	},
+
+	"multi_task_trainer": {
+		"type": "sampler_multi_task_trainer",
+		"sampling_method": "proportional",
+		"patience": 10,
+		"num_epochs": 100,
+		"min_lr": "1e-7",
+		"grad_norm": 5.0,
+		"grad_clipping": 10.0,
+		"cuda_device": 0,
+		"optimizer": {
+			"type": "adam", 
+			"lr": 0.001
+		},
+		"scheduler": {
+			"type": "reduce_on_plateau", 
+			"mode": "min", 
+			"factor": 0.5,
+			"patience": 5, 
+			"threshold": 0.0001,
+			"threshold_mode": "abs", 
+			"verbose": true
+		}
+	}
+}
\ No newline at end of file
diff --git a/evaluate.py b/evaluate.py
new file mode 100644
index 0000000..46e7748
--- /dev/null
+++ b/evaluate.py
@@ -0,0 +1,203 @@
+# coding: utf-8
+
+"""
+The ``evaluate.py`` file can be used to
+evaluate a trained model against a dataset
+and report any metrics calculated by the model.
+It requires a configuration file and a directory in
+which to write the results.
+
+.. code-block:: bash
+
+   $ python evaluate.py --help
+    usage: evaluate.py [-h] -s SERIALIZATION_DIR [-g]
+
+    optional arguments:
+    -h, --help            show this help message and exit
+    -s SERIALIZATION_DIR, --serialization_dir SERIALIZATION_DIR
+                            Directory in which to save the model and its logs.
+    -g, --gold_mentions   Whether or not evaluate using gold mentions in
+                            coreference
+"""
+
+import argparse
+import os
+import json
+import itertools
+import re
+from copy import deepcopy
+import tqdm
+from typing import List, Dict, Any, Iterable
+import torch
+
+from allennlp.models.model import Model
+from allennlp.data import Instance
+from allennlp.data.iterators import DataIterator
+from allennlp.common.checks import check_for_gpu
+from allennlp.common.params import Params
+from allennlp.nn import util
+from allennlp.data import Vocabulary
+
+from hmtl.tasks import Task
+from hmtl.common import create_and_set_iterators
+
+import logging
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def evaluate(model: Model,
+             instances: Iterable[Instance],
+             task_name: str,
+             data_iterator: DataIterator,
+             cuda_device: int) -> Dict[str, Any]:
+    """
+    Evaluate a model for a particular task (usually after training), doing a
+    single unshuffled pass over ``instances`` in eval mode and without gradients.
+
+    Parameters
+    ----------
+    model : ``allennlp.models.model.Model``, required
+        The model to evaluate.
+    instances : ``Iterable[Instance]``, required
+        The (usually test) dataset on which to evaluate the model.
+    task_name : ``str``, required
+        The name of the task on which to evaluate the model.
+    data_iterator : ``DataIterator``
+        Iterator that goes through the dataset.
+    cuda_device : ``int``
+        Cuda device to use; every batch is moved to it.
+
+    Returns
+    -------
+    metrics :  ``Dict[str, Any]``
+        The metrics on the evaluated dataset plus ``"loss"``, the mean loss
+        per batch (``0.0`` when the dataset yields no batch).
+    """
+    check_for_gpu(cuda_device)
+    with torch.no_grad():
+        model.eval()
+
+        iterator = data_iterator(instances, num_epochs = 1, shuffle = False)
+        logger.info("Iterating over dataset")
+        generator_tqdm = tqdm.tqdm(iterator, total = data_iterator.get_num_batches(instances))
+
+        eval_loss = 0
+        nb_batches = 0
+        for batch in generator_tqdm:
+            batch = util.move_to_device(batch, cuda_device)
+            nb_batches += 1
+
+            eval_output_dict = model.forward(task_name = task_name, tensor_batch = batch)
+            eval_loss += eval_output_dict["loss"].item()
+            # Metrics fetched without ``reset``: they only feed the progress bar description.
+            metrics = model.get_metrics(task_name = task_name)
+            metrics["loss"] = float(eval_loss/nb_batches)
+
+            description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
+            generator_tqdm.set_description(description, refresh = False)
+
+        metrics = model.get_metrics(task_name = task_name, reset = True, full = True)
+        # No batch means nothing to average: report 0.0 instead of dividing by zero.
+        metrics["loss"] = float(eval_loss/nb_batches) if nb_batches > 0 else 0.0
+        return metrics
+
+
+if __name__ == "__main__":
+    ### Evaluate from args ###
+    # Reloads the config, vocabulary and best weights stored in ``serialization_dir``, evaluates
+    # every task flagged ``_evaluate_on_test`` and writes the metrics to ``evaluate_metrics.json``.
+
+    # Parse arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-s",
+                        "--serialization_dir",
+                        required = True,
+                        help = "Directory in which to save the model and its logs.",
+                        type = str)
+    parser.add_argument("-g",
+                        "--gold_mentions",
+                        action = "store_true",
+                        required = False,
+                        default = False,
+                        help = "Whether or not evaluate using gold mentions in coreference")
+    args = parser.parse_args()
+    # NOTE(review): ``args.gold_mentions`` is parsed but never read below -- confirm intent.
+
+    params = Params.from_file(params_file = os.path.join(args.serialization_dir, "config.json"))
+
+    ### Instantiate tasks ###
+    task_list = []
+    # Every top-level configuration key starting with "task_" describes one task.
+    task_keys = [key for key in params.keys() if re.search("^task_", key)]
+
+    for key in task_keys:
+        logger.info("Creating %s", key)
+        task_params = params.pop(key)
+        task_description = task_params.pop("task_description")
+        task_data_params = task_params.pop("data_params")
+
+        task = Task.from_params(params = task_description)
+        task_list.append(task)
+
+        # Return value is discarded; presumably this stores the datasets on ``task`` -- TODO confirm.
+        _, _ = task.load_data_from_params(params = task_data_params)
+
+    ### Load Vocabulary from files ###
+    vocab = Vocabulary.from_files(os.path.join(args.serialization_dir, "vocabulary"))
+    logger.info("Vocabulary loaded")
+
+    ### Load the data iterators ###
+    task_list = create_and_set_iterators(params = params, task_list = task_list, vocab = vocab)
+
+    ### Regularization ###
+    regularizer = None
+
+    ### Create model ###
+    model_params = params.pop("model")
+    model = Model.from_params(vocab = vocab, params = model_params, regularizer = regularizer)
+
+    ### Real evaluation ###
+    cuda_device = params.pop("multi_task_trainer").pop_int("cuda_device", -1)
+    if cuda_device >= 0:
+        # ``evaluate`` moves every batch to ``cuda_device``, so the model has to live there too.
+        check_for_gpu(cuda_device)
+        model = model.cuda(cuda_device)
+
+    metrics = {task._name: {} for task in task_list}
+    for task in task_list:
+        if not task._evaluate_on_test: continue
+
+        logger.info("Task %s will be evaluated using the best epoch weights.", task._name)
+        assert task._test_data is not None, "Task {} wants to be evaluated on test dataset but no there is no test data loaded.".format(task._name)
+
+        logger.info("Loading the best epoch weights for task %s", task._name)
+        best_model_state_path = os.path.join(args.serialization_dir, "best_{}.th".format(task._name))
+        # Load on CPU so GPU-saved weights open on any host; load_state_dict copies them in place.
+        best_model_state = torch.load(best_model_state_path, map_location = "cpu")
+        best_model = model
+        best_model.load_state_dict(state_dict = best_model_state)
+
+        # Score every evaluated task with the weights that were best for ``task``.
+        test_metric_dict = {}
+        for pair_task in task_list:
+            if not pair_task._evaluate_on_test: continue
+
+            logger.info("Pair task %s is evaluated with the best model for %s", pair_task._name, task._name)
+            test_metrics = evaluate(model = best_model,
+                                    task_name = pair_task._name,
+                                    instances = pair_task._test_data,
+                                    data_iterator = pair_task._data_iterator,
+                                    cuda_device = cuda_device)
+            test_metric_dict[pair_task._name] = dict(test_metrics)
+
+        metrics[task._name]["test"] = deepcopy(test_metric_dict)
+        logger.info("Finished evaluation of task %s.", task._name)
+
+    metrics_json = json.dumps(metrics, indent = 2)
+    with open(os.path.join(args.serialization_dir, "evaluate_metrics.json"), "w") as metrics_file:
+        metrics_file.write(metrics_json)
+
+    logger.info("Metrics: %s", metrics_json)
\ No newline at end of file
diff --git a/fine_tune.py b/fine_tune.py
new file mode 100644
index 0000000..8cb4964
--- /dev/null
+++ b/fine_tune.py
@@ -0,0 +1,157 @@
+# coding: utf-8
+
+"""
+The ``fine_tune.py`` file is used to continue training (or `fine-tune`) a model on a `different
+dataset` than the one it was originally trained on.  It requires a saved model archive file, a path
+to the data you will continue training with, and a directory in which to write the results.
+
+.. code-block:: bash
+
+   $ python fine_tune.py --help
+    usage: fine_tune.py [-h] -s SERIALIZATION_DIR -c CONFIG_FILE_PATH -p
+                        PRETRAINED_DIR -m PRETRAINED_MODEL_NAME
+
+    optional arguments:
+    -h, --help            show this help message and exit
+    -s SERIALIZATION_DIR, --serialization_dir SERIALIZATION_DIR
+                            Directory in which to save the model and its logs.
+    -c CONFIG_FILE_PATH, --config_file_path CONFIG_FILE_PATH
+                            Path to parameter file describing the new multi-tasked
+                            model to be fine-tuned.
+    -p PRETRAINED_DIR, --pretrained_dir PRETRAINED_DIR
+                            Directory in which was saved the pre-trained model.
+    -m PRETRAINED_MODEL_NAME, --pretrained_model_name PRETRAINED_MODEL_NAME
+                            Name of the weight file for the pretrained model to
+                            fine-tune in the ``pretrained_dir``.
+"""
+
+import argparse
+import itertools
+import os
+import json
+import re
+from copy import deepcopy
+import torch
+from typing import List, Dict, Any, Tuple
+import logging
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+
+from hmtl.tasks import Task
+from hmtl.training.multi_task_trainer import MultiTaskTrainer
+from hmtl.common import create_and_set_iterators
+from evaluate import evaluate
+from train import train_model
+
+from allennlp.models.model import Model
+from allennlp.data import Vocabulary
+from allennlp.data.iterators import DataIterator
+from allennlp.commands.train import create_serialization_dir
+from allennlp.common.params import Params
+from allennlp.common.checks import ConfigurationError
+from allennlp.nn import RegularizerApplicator
+
+logger = logging.getLogger(__name__)
+
+
+if __name__ == "__main__":
+    # Parse the command-line arguments (all four options are required).
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-s",
+                        "--serialization_dir",
+                        required = True,
+                        help = "Directory in which to save the model and its logs.",
+                        type = str)
+    parser.add_argument("-c",
+                        "--config_file_path",
+                        required = True,
+                        help = "Path to parameter file describing the new multi-tasked model to be fine-tuned.",
+                        type = str)
+    parser.add_argument("-p",
+                        "--pretrained_dir",
+                        required = True,
+                        help = "Directory in which was saved the pre-trained model.",
+                        type = str)
+    parser.add_argument("-m",
+                        "--pretrained_model_name",
+                        required = True,
+                        help = "Name of the weight file for the pretrained model to fine-tune in the ``pretrained_dir``.",
+                        type = str)
+    args = parser.parse_args()
+
+    # Read the configuration and keep a copy of it next to the fine-tuned model.
+    params = Params.from_file(params_file = args.config_file_path)
+    serialization_dir = args.serialization_dir
+    create_serialization_dir(params, serialization_dir, False)
+
+    serialization_params = deepcopy(params).as_dict(quiet=True)
+    with open(os.path.join(serialization_dir, "config.json"), "w") as param_file:
+        json.dump(serialization_params, param_file, indent = 4)
+
+
+    ### Instantiate tasks ###
+    task_list = []
+    task_keys = [key for key in params.keys() if re.search("^task_", key)]
+
+    for key in task_keys:
+        logger.info("Creating %s", key)
+        task_params = params.pop(key)
+        task_description = task_params.pop("task_description")
+        task_data_params = task_params.pop("data_params")
+
+        task = Task.from_params(params = task_description)
+        task_list.append(task)
+
+        _, _ = task.load_data_from_params(params = task_data_params)
+
+
+    ### Load Vocabulary from files and save it to the new serialization_dir ###
+    # PLEASE NOTE that here, we suppose that the vocabulary is the same for the pre-trained model
+    # and the model to fine-tune. The most noticeable implication of this hypothesis is that the label specs
+    # between the two datasets (for pre-training and for fine-tuning) are exactly the same.
+    vocab = Vocabulary.from_files(os.path.join(args.pretrained_dir, "vocabulary"))
+    logger.info("Vocabulary loaded from %s", os.path.join(args.pretrained_dir, "vocabulary"))
+
+    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))
+    logger.info("Save vocabulary to file %s", os.path.join(serialization_dir, "vocabulary"))
+
+
+    ### Load the data iterators for each task ###
+    task_list = create_and_set_iterators(params = params, task_list = task_list, vocab = vocab)
+
+
+    ### Load Regularizations	###
+    regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))
+
+
+    ### Create model ###
+    model_params = params.pop("model")
+    model = Model.from_params(vocab = vocab, params = model_params, regularizer = regularizer)
+
+    ### Load the pretrained weights ###
+    pretrained_model_state_path = os.path.join(args.pretrained_dir, args.pretrained_model_name)
+    logger.info("Loading the pretrained model from %s", pretrained_model_state_path)
+    pretrained_model_state = torch.load(pretrained_model_state_path)
+    try:
+        model.load_state_dict(state_dict = pretrained_model_state)
+    except (RuntimeError, KeyError) as err:
+        # Only a state-dict mismatch means incompatible configs; torch.load errors surface as-is.
+        raise ConfigurationError("It appears that the configuration of the pretrained model and "
+                                "the model to fine-tune are not compatible. "
+                                "Please check the compatibility of the encoders and taggers in the "
+                                "config files.") from err
+
+    ### Create multi-task trainer ###
+    multi_task_trainer_params = params.pop("multi_task_trainer")
+    trainer = MultiTaskTrainer.from_params(model = model,
+                                        task_list = task_list,
+                                        serialization_dir = serialization_dir,
+                                        params = multi_task_trainer_params)
+
+
+    ### Launch training ###
+    metrics = train_model(multi_task_trainer = trainer,
+                        recover = False)
+    if metrics is not None:
+        logger.info("Fine-tuning is finished ! Let's have a drink. It's on the house !")
\ No newline at end of file
diff --git a/hmtl/__init__.py b/hmtl/__init__.py
new file mode 100644
index 0000000..e0807c6
--- /dev/null
+++ b/hmtl/__init__.py
@@ -0,0 +1,7 @@
+# coding: utf-8
+
+from hmtl.dataset_readers import *
+from hmtl.modules import *
+from hmtl.models import *
+from hmtl.tasks import *
+from hmtl.training import *
\ No newline at end of file
diff --git a/hmtl/common/__init__.py b/hmtl/common/__init__.py
new file mode 100644
index 0000000..183c8db
--- /dev/null
+++ b/hmtl/common/__init__.py
@@ -0,0 +1,3 @@
+# coding: utf-8
+
+from hmtl.common.util import create_and_set_iterators
\ No newline at end of file
diff --git a/hmtl/common/util.py b/hmtl/common/util.py
new file mode 100644
index 0000000..dc16492
--- /dev/null
+++ b/hmtl/common/util.py
@@ -0,0 +1,53 @@
+# coding: utf-8
+
+"""
+Various utilities that don't fit anywhere else.
+"""
+
+from typing import List, Dict, Any, Tuple
+
+from allennlp.common.params import Params
+from allennlp.data import Vocabulary
+from allennlp.data.iterators import DataIterator
+
+from hmtl.tasks import Task
+
+
+
+def create_and_set_iterators(params: Params,
+                            task_list: List[Task],
+                            vocab: Vocabulary) -> List[Task]:
+    '''
+    Attach a data iterator to every task.
+
+    A task gets its own iterator when the ``iterators`` configuration holds an
+    ``iterator_<task name>`` entry, and the shared ``iterator`` one otherwise.
+
+    Parameters
+    ----------
+    params: ``Params``
+        A parameter object specifying an experiment; its ``iterators`` entry is popped.
+    task_list: ``List[Task]``
+        A list containing the tasks of the model to train.
+    vocab: ``Vocabulary``
+        The vocabulary every created iterator is indexed with.
+
+    Returns
+    -------
+    task_list: ``List[Task]``
+        The same list of tasks, each one with its data iterator set.
+    '''
+    iterators_config = params.pop("iterators")
+    # The shared iterator is always built, even when every task overrides it.
+    shared_iterator = DataIterator.from_params(iterators_config.pop("iterator"))
+    shared_iterator.index_with(vocab)
+
+    for task in task_list:
+        task_iterator_config = iterators_config.pop("iterator_" + task._name, None)
+        if task_iterator_config is None:
+            task.set_data_iterator(shared_iterator)
+            continue
+        task_iterator = DataIterator.from_params(task_iterator_config)
+        task_iterator.index_with(vocab)
+        task.set_data_iterator(task_iterator)
+    return task_list
\ No newline at end of file
diff --git a/hmtl/dataset_readers/__init__.py b/hmtl/dataset_readers/__init__.py
new file mode 100644
index 0000000..fbf8504
--- /dev/null
+++ b/hmtl/dataset_readers/__init__.py
@@ -0,0 +1,6 @@
+# coding: utf-8
+
+from hmtl.dataset_readers.ner_ontonotes import NerOntonotesReader
+from hmtl.dataset_readers.mention_ace import MentionACEReader
+from hmtl.dataset_readers.relation_ace import RelationACEReader
+from hmtl.dataset_readers.coref_ace import CorefACEReader
\ No newline at end of file
diff --git a/hmtl/dataset_readers/coref_ace.py b/hmtl/dataset_readers/coref_ace.py
new file mode 100644
index 0000000..054783f
--- /dev/null
+++ b/hmtl/dataset_readers/coref_ace.py
@@ -0,0 +1,180 @@
+# coding: utf-8
+
+import logging
+import collections
+from typing import Any, Dict, List, Optional, Tuple, DefaultDict, Set
+
+from overrides import overrides
+
+from allennlp.common import Params
+from allennlp.common.file_utils import cached_path
+from allennlp.data.dataset_readers.dataset_reader import DatasetReader
+from allennlp.data.fields import Field, ListField, TextField, SpanField, MetadataField, SequenceLabelField
+from allennlp.data.instance import Instance
+from allennlp.data.tokenizers import Token
+from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
+from allennlp.data.dataset_readers.dataset_utils import enumerate_spans
+
+from hmtl.dataset_readers.dataset_utils import ACE
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+def canonicalize_clusters(clusters: DefaultDict[int, List[Tuple[int, int]]]) -> List[List[Tuple[int, int]]]:
+    """
+    Merge clusters that share an identical span.
+
+    The CoNLL 2012 data includes 2 annotated spans which are identical but
+    have different ids. Clusters are visited in the order of ``clusters.values()``;
+    a cluster holding a span already present in a previously kept cluster is
+    folded into the first such cluster, otherwise it is kept as a new cluster.
+
+    Parameters
+    ----------
+    clusters : ``DefaultDict[int, List[Tuple[int, int]]]``
+        Maps a cluster id to the spans of that cluster.
+
+    Returns
+    -------
+    The merged clusters, each one as a list of its distinct spans.
+    """
+    kept: List[Set[Tuple[int, int]]] = []
+
+    for spans in clusters.values():
+        # First kept cluster sharing a span with ``spans``, scanning the spans in order.
+        target = next((group for span in spans for group in kept if span in group), None)
+        if target is None:
+            kept.append(set(spans))
+        else:
+            # Fold the current cluster into the one that already holds the shared span.
+            target.update(spans)
+
+    return [list(group) for group in kept]
+
+
+@DatasetReader.register("coref_ace")
+class CorefACEReader(DatasetReader):
+    """
+    A dataset reader to read the coref clusters from an ACE dataset
+    previously pre-processed to fit the CoNLL-coreference format.
+
+    Parameters
+    ----------
+    max_span_width: ``int``, required.
+        The maximum width of candidate spans to consider.
+    token_indexers : ``Dict[str, TokenIndexer]``, optional
+        This is used to index the words in the document.  See :class:`TokenIndexer`.
+        Default is ``{"tokens": SingleIdTokenIndexer()}``.
+    lazy : ``bool``, optional (default = False)
+        Whether or not the dataset should be loaded in lazy way. 
+    """
+    def __init__(self, max_span_width: int,
+                 token_indexers: Dict[str, TokenIndexer] = None,
+                 lazy: bool = False) -> None:
+        super().__init__(lazy)
+        # Fall back on a plain single-id indexer when no (or an empty) mapping is given.
+        self._token_indexers = token_indexers if token_indexers else {"tokens": SingleIdTokenIndexer()}
+        self._max_span_width = max_span_width
+
+    @overrides
+    def _read(self, file_path: str):
+        """
+        Yield one ``Instance`` per document read from ``file_path``.
+
+        The coreference spans of a sentence are indexed relative to that sentence, so
+        they are shifted by the number of tokens of the preceding sentences before
+        being gathered into document-level clusters.
+        """
+        # if `file_path` is a URL, redirect to the cache
+        local_path = cached_path(file_path)
+
+        for document in ACE().dataset_document_iterator(local_path):
+            doc_clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
+            offset = 0
+            for sent in document:
+                for cluster_id, (first, last) in sent.coref_spans:
+                    doc_clusters[cluster_id].append((first + offset, last + offset))
+                offset += len(sent.words)
+
+            merged_clusters = canonicalize_clusters(doc_clusters)
+            yield self.text_to_instance([sent.words for sent in document], merged_clusters)
+
+    @overrides
+    def text_to_instance(self,  # type: ignore
+                         sentences: List[List[str]],
+                         gold_clusters: Optional[List[List[Tuple[int, int]]]] = None) -> Instance:
+        # pylint: disable=arguments-differ
+        """
+        Build the ``Instance`` of a document from its sentences and, when available,
+        its gold coreference clusters.
+
+        Parameters
+        ----------
+        sentences : ``List[List[str]]``, required.
+            A list of lists representing the tokenised words and sentences in the document.
+        gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
+            A list of all clusters in the document, represented as word spans. Each cluster
+            contains some number of spans, which can be nested and overlap, but will never
+            exactly match between clusters.
+
+        Returns
+        -------
+        An ``Instance`` containing the following ``Fields``:
+            text : ``TextField``
+                The text of the full document.
+            spans : ``ListField[SpanField]``
+                The candidate spans enumerated in every sentence, as ``SpanFields``
+                with respect to the document text.
+            metadata : ``MetadataField``
+                The normalized tokens (``original_text``) and, when given, the gold
+                clusters (``clusters``).
+            span_labels : ``SequenceLabelField``, optional
+                The id of the cluster which each candidate span belongs to, or -1 if it
+                does not belong to a cluster. As the number of labels depends on how many
+                spans are considered, they are represented as a ``SequenceLabelField``
+                with respect to the ``spans`` ``ListField``. Only set with gold clusters.
+        """
+        has_gold = gold_clusters is not None
+        tokens = [self._normalize_word(token)
+                  for sent in sentences
+                  for token in sent]
+        text_field = TextField([Token(token) for token in tokens], self._token_indexers)
+
+        metadata: Dict[str, Any] = {"original_text": tokens}
+        # Span -> index of its cluster in ``gold_clusters``.
+        mention_to_cluster: Dict[Tuple[int, int], int] = {}
+        if has_gold:
+            metadata["clusters"] = gold_clusters
+            mention_to_cluster = {tuple(mention): cluster_id
+                                  for cluster_id, cluster in enumerate(gold_clusters)
+                                  for mention in cluster}
+
+        candidate_spans: List[Field] = []
+        labels: Optional[List[int]] = [] if has_gold else None
+
+        offset = 0
+        for sent in sentences:
+            # ``offset`` is the number of tokens in the sentences already visited.
+            for first, last in enumerate_spans(sent,
+                                               offset=offset,
+                                               max_span_width=self._max_span_width):
+                if labels is not None:
+                    labels.append(mention_to_cluster.get((first, last), -1))
+                candidate_spans.append(SpanField(first, last, text_field))
+            offset += len(sent)
+
+        span_field = ListField(candidate_spans)
+        fields: Dict[str, Field] = {"text": text_field,
+                                    "spans": span_field,
+                                    "metadata": MetadataField(metadata)}
+        if labels is not None:
+            fields["span_labels"] = SequenceLabelField(labels, span_field)
+
+        return Instance(fields)
+
+    @staticmethod
+    def _normalize_word(word):
+        """Return ``word`` without its leading slash when it is ``/.`` or ``/?``."""
+        if word in ("/.", "/?"):
+            return word[1:]
+        return word
diff --git a/hmtl/dataset_readers/dataset_utils/__init__.py b/hmtl/dataset_readers/dataset_utils/__init__.py
new file mode 100644
index 0000000..6610b37
--- /dev/null
+++ b/hmtl/dataset_readers/dataset_utils/__init__.py
@@ -0,0 +1,3 @@
+# coding: utf-8
+
+from hmtl.dataset_readers.dataset_utils.ace import ACE, ACESentence
\ No newline at end of file
diff --git a/hmtl/dataset_readers/dataset_utils/ace.py b/hmtl/dataset_readers/dataset_utils/ace.py
new file mode 100644
index 0000000..b29003a
--- /dev/null
+++ b/hmtl/dataset_readers/dataset_utils/ace.py
@@ -0,0 +1,282 @@
+# coding: utf-8
+
+from typing import DefaultDict, List, Optional, Iterator, Set, Tuple
+from collections import defaultdict
+import codecs
+import os
+import logging
+
+from allennlp.data.dataset_readers.dataset_utils import iob1_to_bioul
+
+from nltk import Tree
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+TypedSpan = Tuple[int, Tuple[int, int]]  # pylint: disable=invalid-name
+TypedStringSpan = Tuple[str, Tuple[int, int]]  # pylint: disable=invalid-name
+
+class ACESentence:
+    """
+    Container for the annotations available for a single ACE CONLL-formatted sentence.
+
+    Parameters
+    ----------
+    words : ``List[str]``
+        The tokens of the sentence, as segmented/tokenized with spaCy.
+    mention_tags : ``List[str]``
+        The tags for Entity Mention Detection in the sentence.
+    relations : ``List[Tuple[str, List[str]]]``
+        The relations tags for Relation Extraction in the sentence.
+    last_head_token_relations : ``List[Tuple[str, List[str]]]``
+        The relations tags between last tokens for ARG1 and ARG2 for Relation Extraction in the sentence.
+    coref_spans : ``Set[TypedSpan]``
+        The spans for entity mentions involved in coreference resolution within the sentence.
+        Each element is a tuple composed of (cluster_id, (start_index, end_index)). Indices
+        are `inclusive`.
+    """
+    def __init__(self,
+                words: List[str],
+                mention_tags: List[str],
+                relations: List[Tuple[str, List[str]]],
+                last_head_token_relations: List[Tuple[str, List[str]]],
+                coref_spans: Set[TypedSpan]):
+        # Every argument is stored as given, under an attribute of the same name.
+        self.words, self.mention_tags = words, mention_tags
+        self.relations = relations
+        self.last_head_token_relations = last_head_token_relations
+        self.coref_spans = coref_spans
+
+
+class ACE:
+    """
+    This reader is designed to read the ACE (2005 or 2004) datasets once they
+    have been converted to a CoNLL-like column format
+    (see for instance the OntoNotes dataset).
+    """
+    def dataset_iterator(self, file_path: str) -> Iterator[ACESentence]:
+        """Yield every sentence of every dataset file found under ``file_path``."""
+        for data_path in self.dataset_path_iterator(file_path):
+            # Sentences come out file after file, in the order the files are listed.
+            for sentence in self.sentence_iterator(data_path):
+                yield sentence
+
+    @staticmethod
+    def dataset_path_iterator(file_path: str) -> Iterator[str]:
+        """
+        Yield the path of every file whose name ends with ``like_conll`` found
+        under the directory ``file_path``, sub-directories included.
+        """
+        logger.info("Reading ACE CONLL-like sentences from dataset files at: %s", file_path)
+        # ``os.walk`` is consumed lazily: the whole tree does not have to be
+        # listed before the first path is yielded.
+        for root, _, files in os.walk(file_path):
+            for data_file in files:
+                if data_file.endswith("like_conll"):
+                    yield os.path.join(root, data_file)
+
+    def dataset_document_iterator(self, file_path: str) -> Iterator[List[ACESentence]]:
+        """Yield each document of a CONLL-formatted file as the list of its sentences."""
+        with codecs.open(file_path, 'r', encoding='utf8') as open_file:
+            conll_rows = []
+            document: List[ACESentence] = []
+            for line in open_file:
+                line = line.strip()
+                if line != '' and not line.startswith('#'):
+                    # Non-empty line. Collect the annotation.
+                    conll_rows.append(line)
+                elif conll_rows:
+                    # A blank or comment line closes the sentence being collected.
+                    document.append(self._conll_rows_to_sentence(conll_rows))
+                    conll_rows = []
+                if line.startswith("#end document"):
+                    # Document boundary: hand over the sentences gathered so far.
+                    yield document
+                    document = []
+            if conll_rows:
+                # The file ended right after an annotation row: keep that last sentence.
+                document.append(self._conll_rows_to_sentence(conll_rows))
+            if document:
+                # Stragglers, or files which do not end with '#end document'.
+                yield document
+
+    def sentence_iterator(self, file_path: str) -> Iterator[ACESentence]:
+        """
+        Yield the sentences of a single CONLL formatted file, document after document.
+        """
+        for document in self.dataset_document_iterator(file_path):
+            # A document is the list of its sentences.
+            yield from document
+
+    def _conll_rows_to_sentence(self, conll_rows: List[str]) -> ACESentence:
+        """
+        Build an ``ACESentence`` from the rows (one per token) of a sentence.
+
+        Each row is split on whitespace: the second column is the word, the last
+        column is the coreference annotation and the columns in between are span
+        annotations. The first span column gives the mention tags, every other
+        span column is a relation frame.
+
+        Parameters
+        ----------
+        conll_rows : ``List[str]``
+            The annotation rows of one sentence.
+
+        Returns
+        -------
+        An ``ACESentence`` holding the words, the converted mention tags, the converted
+        relation frames, the relation frames reduced to the labels carried by their
+        "L-"/"U-" tags and the coreference spans of the sentence.
+        """
+        words: List[str] = []
+
+        # One list of BIO tags per span column, and the label currently open in each.
+        span_labels: List[List[str]] = []
+        open_labels: List[Optional[str]] = []
+
+        # Cluster id -> List of (start_index, end_index) spans.
+        clusters: DefaultDict[int, List[Tuple[int, int]]] = defaultdict(list)
+        # Cluster id -> List of start_indices which are open for this id.
+        coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)
+
+        for index, row in enumerate(conll_rows):
+            columns = row.split()
+            word = columns[1]
+            span_columns = columns[2:-1]
+
+            if not span_labels:
+                span_labels = [[] for _ in span_columns]
+                open_labels = [None for _ in span_columns]
+            self._process_span_annotations_for_word(annotations = span_columns,
+                                                    span_labels = span_labels,
+                                                    current_span_labels = open_labels)
+
+            self._process_coref_span_annotations_for_word(columns[-1], index, clusters, coref_stacks)
+            words.append(word)
+
+        # The first span column holds the entity mentions.
+        mention_tags = iob1_to_bioul(span_labels[0])
+
+        coref_span_tuples: Set[TypedSpan] = {(cluster_id, span)
+                                             for cluster_id, span_list in clusters.items()
+                                             for span in span_list}
+
+        # Cf paper, relations are modeled between the last tokens of the heads: only
+        # the label carried by an "L-" or "U-" tag is kept, any other tag becomes "*".
+        bioul_relations = [iob1_to_bioul(frame) for frame in span_labels[1:]]
+        last_head_token_relations = [[tag[2:] if tag[:2] in ["L-", "U-"] else "*" for tag in frame]
+                                     for frame in bioul_relations]
+
+        return ACESentence(words, mention_tags, bioul_relations, last_head_token_relations, coref_span_tuples)
+        
+        
+    @staticmethod
+    def _process_mention_tags(annotations: List[str]):
+        """
+        Convert bracketed CoNLL-NER style mention annotations into BIO tags.
+        """
+        bio_tags = []
+        open_label = None
+        for raw in annotations:
+            label = raw.strip("()*")
+            if "(" in raw:
+                open_label = label
+                tag = "B-" + label
+            elif open_label is None:
+                tag = "O"
+            else:
+                tag = "I-" + open_label
+            bio_tags.append(tag)
+            if ")" in raw:
+                open_label = None
+        return bio_tags
+        
+    @staticmethod
+    def _process_span_annotations_for_word(annotations: List[str],
+                                           span_labels: List[List[str]],
+                                           current_span_labels: List[Optional[str]]) -> None:
+        """
+        Append to ``span_labels`` the BIO tag of a single word for every annotation
+        column, keeping track in ``current_span_labels`` of the span left open in each
+        column. Both lists are modified in place.
+
+        Parameters
+        ----------
+        annotations: ``List[str]``
+            The bracketed annotations of the word, one per annotation column.
+        span_labels : ``List[List[str]]``
+            A list of lists, one for each annotation column, to incrementally collect
+            the BIO tags for a sequence.
+        current_span_labels : ``List[Optional[str]]``
+            The currently open span per annotation column, or ``None`` if there is no open span.
+        """
+        for column, raw in enumerate(annotations):
+            # Drop all the bracketing information to get the bare label.
+            label = raw.strip("()*")
+
+            if "(" in raw:
+                # A span starts on this word.
+                tag = "B-" + label
+            elif current_span_labels[column] is not None:
+                # No '(' but a span is open in this column: the word is inside it.
+                tag = "I-" + current_span_labels[column]
+            else:
+                # The word is outside any span.
+                tag = "O"
+            span_labels[column].append(tag)
+
+            # Update the open span of the column: a ')' closes it on this word
+            # (even when it was opened on the same word), a lone '(' opens it.
+            if ")" in raw:
+                current_span_labels[column] = None
+            elif "(" in raw:
+                current_span_labels[column] = label
+                
+                
+    @staticmethod
+    def _process_coref_span_annotations_for_word(label: str,
+                                                 word_index: int,
+                                                 clusters: DefaultDict[int, List[Tuple[int, int]]],
+                                                 coref_stacks: DefaultDict[int, List[int]]) -> None:
+        """
+        Update ``clusters`` and ``coref_stacks`` (both modified in place) with the coref
+        label of a word: a span may start, end, or start and end on the word. The label
+        ``-`` marks a word outside of all spans and leaves both dictionaries untouched.
+
+        Parameters
+        ----------
+        label : ``str``
+            The coref label for this word.
+        word_index : ``int``
+            The word index into the sentence.
+        clusters : ``DefaultDict[int, List[Tuple[int, int]]]``
+            A dictionary mapping cluster ids to lists of inclusive spans into the
+            sentence.
+        coref_stacks: ``DefaultDict[int, List[int]]``
+            Stacks for each cluster id to hold the start indices of active spans (spans
+            which we are inside of when processing a given word). Spans with the same id
+            can be nested, which is why we collect these opening spans on a stack, e.g:
+
+            [Greg, the baker who referred to [himself]_ID1 as 'the bread man']_ID1
+        """
+        if label == "-":
+            return
+
+        # The conll representation of coref spans allows spans to overlap: spans
+        # which begin or end at the same word are separated by a "|".
+        for segment in label.split("|"):
+            opens_here = segment[0] == "("
+            closes_here = segment[-1] == ")"
+
+            if opens_here and closes_here:
+                # Single word span: it begins and ends at this word.
+                clusters[int(segment[1:-1])].append((word_index, word_index))
+            elif opens_here:
+                # The span is starting: record where, until its end is met.
+                coref_stacks[int(segment[1:])].append(word_index)
+            else:
+                # The span for this id is ending, but didn't start at this word:
+                # its start index is the most recently recorded one for this id,
+                # which is taken off the stack.
+                cluster_id = int(segment[:-1])
+                start = coref_stacks[cluster_id].pop()
+                clusters[cluster_id].append((start, word_index))
\ No newline at end of file
diff --git a/hmtl/dataset_readers/mention_ace.py b/hmtl/dataset_readers/mention_ace.py
new file mode 100644
index 0000000..048f060
--- /dev/null
+++ b/hmtl/dataset_readers/mention_ace.py
@@ -0,0 +1,75 @@
+# coding: utf-8
+
+import logging
+from typing import Dict, List, Iterable, Iterator
+
+from overrides import overrides
+import codecs
+
+from allennlp.common import Params
+from allennlp.common.checks import ConfigurationError
+from allennlp.common.file_utils import cached_path
+from allennlp.data.dataset_readers.dataset_reader import DatasetReader
+from allennlp.data.dataset_readers.dataset_utils import iob1_to_bioul
+from allennlp.data.fields import Field, TextField, SequenceLabelField
+from allennlp.data.instance import Instance
+from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
+from allennlp.data.tokenizers import Token
+from allennlp.data.dataset_readers.dataset_utils import Ontonotes, OntonotesSentence
+
+from hmtl.dataset_readers.dataset_utils import ACE, ACESentence
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+        
+@DatasetReader.register("mention_ace")
+class MentionACEReader(DatasetReader):
+    """
+    Dataset reader yielding the entity mention tags of an ACE dataset that was
+    previously pre-processed to fit the CoNLL-NER format.
+    """
+    def __init__(self,
+                 token_indexers: Dict[str, TokenIndexer] = None,
+                 label_namespace: str = "ace_mention_labels",
+                 lazy: bool = False) -> None:
+        super().__init__(lazy)
+        self._label_namespace = label_namespace
+        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
+
+
+    @staticmethod
+    def _sentence_iterate(ace_reader: ACE,
+                          file_path: str) -> Iterable[ACESentence]:
+        # Chain the sentences of every CoNLL file found under ``file_path``.
+        for conll_file in ace_reader.dataset_path_iterator(file_path):
+            for sentence in ace_reader.sentence_iterator(conll_file):
+                yield sentence
+
+
+    @overrides
+    def _read(self,
+              file_path: str):
+        file_path = cached_path(file_path)  # if `file_path` is a URL, redirect to the cache
+        ace_reader = ACE()
+        logger.info("Reading ACE Mention instances from dataset files at: %s", file_path)
+        for sentence in self._sentence_iterate(ace_reader, file_path):
+            tokens = [Token(word) for word in sentence.words]
+            # Sentences without mention annotations are tagged "O" everywhere.
+            tags = sentence.mention_tags if sentence.mention_tags else ["O"] * len(tokens)
+            yield self.text_to_instance(tokens, tags)
+
+
+    def text_to_instance(self,
+                         tokens: List[Token],
+                         tags: List[str] = None) -> Instance:
+        # pylint: disable=arguments-differ
+        # The "tags" field is only added when tags are provided (and non-empty).
+        text_field = TextField(tokens, token_indexers=self._token_indexers)
+        fields: Dict[str, Field] = {"tokens": text_field}
+        if tags:
+            fields["tags"] = SequenceLabelField(labels=tags,
+                                                sequence_field=text_field,
+                                                label_namespace=self._label_namespace)
+        return Instance(fields)
+         
\ No newline at end of file
diff --git a/hmtl/dataset_readers/ner_ontonotes.py b/hmtl/dataset_readers/ner_ontonotes.py
new file mode 100644
index 0000000..f8d0c17
--- /dev/null
+++ b/hmtl/dataset_readers/ner_ontonotes.py
@@ -0,0 +1,107 @@
+# coding: utf-8
+
+import logging
+from typing import Dict, List, Iterable
+
+from overrides import overrides
+
+from allennlp.common import Params
+from allennlp.common.checks import ConfigurationError
+from allennlp.common.file_utils import cached_path
+from allennlp.data.dataset_readers.dataset_reader import DatasetReader
+from allennlp.data.dataset_readers.dataset_utils import iob1_to_bioul
+from allennlp.data.fields import Field, TextField, SequenceLabelField
+from allennlp.data.instance import Instance
+from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
+from allennlp.data.tokenizers import Token
+from allennlp.data.dataset_readers.dataset_utils import Ontonotes, OntonotesSentence
+
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+@DatasetReader.register("ner_ontonotes")
+class NerOntonotesReader(DatasetReader):
+    '''
+    An ``allennlp.data.dataset_readers.dataset_reader.DatasetReader`` for reading
+    NER annotations in CoNLL-formatted OntoNotes dataset.
+
+    NB: This DatasetReader was implemented before the current implementation of
+    ``OntonotesNamedEntityRecognition`` in AllenNLP. It is believed to do pretty much the same thing.
+
+    Parameters
+    ----------
+    token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
+        We use this to define the input representation for the text.  See :class:`TokenIndexer`.
+        Map a token to an id.
+    domain_identifier : ``str``, optional (default = None)
+        The subdomain to load: only the files whose path contains ``/<domain_identifier>/``
+        are read. If None is specified, the whole dataset is loaded.
+    label_namespace : ``str``, optional (default = "ontonotes_ner_labels")
+        The tag/label namespace for the task/dataset considered.
+    lazy : ``bool``, optional (default = False)
+        Whether or not the dataset should be loaded in lazy way.
+        Refer to https://github.com/allenai/allennlp/blob/master/tutorials/getting_started/laziness.md
+        for more details about laziness.
+    coding_scheme: ``str``, optional (default=``IOB1``)
+        Specifies the coding scheme for the NER tags.
+        Valid options are ``IOB1`` and ``BIOUL``.  The ``IOB1`` default maintains
+        the original IOB1 scheme in the CoNLL data.
+        In the IOB1 scheme, I is a token inside a span, O is a token outside
+        a span and B is the beginning of span immediately following another
+        span of the same type.
+    '''
+    def __init__(self,
+                 token_indexers: Dict[str, TokenIndexer] = None,
+                 domain_identifier: str = None,
+                 label_namespace: str = "ontonotes_ner_labels",
+                 lazy: bool = False,
+                 coding_scheme: str = "IOB1") -> None:
+        super().__init__(lazy)
+        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
+        self._domain_identifier = domain_identifier
+        self._label_namespace = label_namespace
+        self._coding_scheme = coding_scheme
+        if coding_scheme not in ("IOB1", "BIOUL"):
+            raise ConfigurationError("unknown coding_scheme: {}".format(coding_scheme))
+
+    @overrides
+    def _read(self,
+              file_path: str):
+        file_path = cached_path(file_path) # if `file_path` is a URL, redirect to the cache
+        ontonotes_reader = Ontonotes()
+        logger.info("Reading NER instances from dataset files at: %s", file_path)
+        if self._domain_identifier is not None:
+            logger.info("Filtering to only include file paths containing the %s domain", self._domain_identifier)
+
+        for sentence in self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier):
+            tokens = [Token(t) for t in sentence.words]
+            # Sentences without any entity annotation are tagged "O" everywhere.
+            tags = sentence.named_entities or ["O" for _ in tokens]
+            if self._coding_scheme == "BIOUL":
+                tags = iob1_to_bioul(tags)
+            yield self.text_to_instance(tokens, tags)
+
+    @staticmethod
+    def _ontonotes_subset(ontonotes_reader: Ontonotes,
+                          file_path: str,
+                          domain_identifier: str) -> Iterable[OntonotesSentence]:
+        """
+        Yield the sentences of the CoNLL files under ``file_path`` (optionally of a single subdomain).
+        """
+        for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
+            # The domain is matched as a full directory name within the file path.
+            if domain_identifier is None or "/{}/".format(domain_identifier) in conll_file:
+                yield from ontonotes_reader.sentence_iterator(conll_file)
+
+    def text_to_instance(self,
+                         tokens: List[Token],
+                         tags: List[str] = None) -> Instance:
+        # pylint: disable=arguments-differ
+        fields: Dict[str, Field] = {}
+        text_field = TextField(tokens, token_indexers=self._token_indexers)
+        fields['tokens'] = text_field
+        if tags:
+            fields['tags'] = SequenceLabelField(labels = tags, sequence_field = text_field, label_namespace = self._label_namespace)
+        return Instance(fields)
+                
\ No newline at end of file
diff --git a/hmtl/dataset_readers/relation_ace.py b/hmtl/dataset_readers/relation_ace.py
new file mode 100644
index 0000000..55b96aa
--- /dev/null
+++ b/hmtl/dataset_readers/relation_ace.py
@@ -0,0 +1,80 @@
+# coding: utf-8
+
+import logging
+from typing import Dict, List, Iterable, Iterator
+
+from overrides import overrides
+import codecs
+
+from allennlp.common import Params
+from allennlp.common.checks import ConfigurationError
+from allennlp.common.file_utils import cached_path
+from allennlp.data.dataset_readers.dataset_reader import DatasetReader
+from allennlp.data.dataset_readers.dataset_utils import iob1_to_bioul
+from allennlp.data.fields import Field, TextField, SequenceLabelField, ListField
+from allennlp.data.instance import Instance
+from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
+from allennlp.data.tokenizers import Token
+from allennlp.data.dataset_readers.dataset_utils import Ontonotes, OntonotesSentence
+
+from hmtl.dataset_readers.dataset_utils import ACE, ACESentence
+#from hmtl.fields import MultipleSequenceLabelField
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+        
+@DatasetReader.register("relation_ace")
+class RelationACEReader(DatasetReader):
+    """
+    Dataset reader yielding the relation links of an ACE dataset that was
+    previously pre-processed to fit the CoNLL-SRL format.
+
+    Sentences that carry no relation are skipped when reading.
+    """
+    def __init__(self,
+                 token_indexers: Dict[str, TokenIndexer] = None,
+                 label_namespace: str = "relation_ace_labels",
+                 lazy: bool = False) -> None:
+        super().__init__(lazy)
+        self._label_namespace = label_namespace
+        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
+
+
+    @staticmethod
+    def _sentence_iterate(ace_reader: ACE,
+                          file_path: str) -> Iterable[ACESentence]:
+        for conll_file in ace_reader.dataset_path_iterator(file_path):
+            for sentence in ace_reader.sentence_iterator(conll_file):
+                yield sentence
+
+
+    @overrides
+    def _read(self,
+              file_path: str):
+        file_path = cached_path(file_path)  # if `file_path` is a URL, redirect to the cache
+        ace_reader = ACE()
+        logger.info("Reading Relation labels from dataset files at: %s", file_path)
+        for sentence in self._sentence_iterate(ace_reader, file_path):
+            if sentence.relations == []:
+                # No instance is produced for a sentence without relations.
+                continue
+            tokens = [Token(word) for word in sentence.words]
+            yield self.text_to_instance(tokens, sentence.last_head_token_relations)
+
+
+    def text_to_instance(self,
+                         tokens: List[Token],
+                         relations = None) -> Instance:
+        # pylint: disable=arguments-differ
+        text_field = TextField(tokens, token_indexers=self._token_indexers)
+        fields: Dict[str, Field] = {"text": text_field}
+        if relations is not None:
+            # Each entry of `relations` becomes one SequenceLabelField over the text.
+            fields["relations"] = ListField(field_list=[
+                SequenceLabelField(labels=relation,
+                                   sequence_field=text_field,
+                                   label_namespace=self._label_namespace)
+                for relation in relations])
+        return Instance(fields)
+                
\ No newline at end of file
diff --git a/hmtl/models/__init__.py b/hmtl/models/__init__.py
new file mode 100644
index 0000000..b13b730
--- /dev/null
+++ b/hmtl/models/__init__.py
@@ -0,0 +1,21 @@
+# coding: utf-8
+
+from hmtl.models.coref_custom import CoreferenceCustom
+from hmtl.models.relation_extraction import RelationExtractor
+
+#Single Module
+from hmtl.models.layerNer import LayerNer
+from hmtl.models.layerRelation import LayerRelation
+from hmtl.models.layerCoref import LayerCoref
+
+#Two modules
+from hmtl.models.layerNerEmd import LayerNerEmd
+from hmtl.models.layerEmdRelation import LayerEmdRelation
+from hmtl.models.layerEmdCoref import LayerEmdCoref
+
+#Three modules
+from hmtl.models.layerNerEmdCoref import LayerNerEmdCoref
+from hmtl.models.layerNerEmdRelation import LayerNerEmdRelation
+
+#Four modules
+from hmtl.models.hmtl import HMTL
\ No newline at end of file
diff --git a/hmtl/models/coref_custom.py b/hmtl/models/coref_custom.py
new file mode 100644
index 0000000..45e07e3
--- /dev/null
+++ b/hmtl/models/coref_custom.py
@@ -0,0 +1,204 @@
+import logging
+import math
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from overrides import overrides
+
+from allennlp.data import Vocabulary
+from allennlp.models.model import Model
+from allennlp.modules.token_embedders import Embedding
+from allennlp.modules import FeedForward
+from allennlp.modules import Seq2SeqEncoder, TimeDistributed, TextFieldEmbedder, SpanPruner
+from allennlp.modules.span_extractors import SelfAttentiveSpanExtractor, EndpointSpanExtractor
+from allennlp.nn import util, InitializerApplicator, RegularizerApplicator
+from allennlp.training.metrics import MentionRecall, ConllCorefScores
+from allennlp.models.coreference_resolution import CoreferenceResolver
+
+from hmtl.training.metrics import ConllCorefFullScores
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+class CoreferenceCustom(CoreferenceResolver):
+    """
+    This class implements a marginally modified version of ``allennlp.models.coreference_resolution.CoreferenceResolver``
+    which is an implementation of the model of Lee et al., 2017.
+    The two modifications are:
+        1/ Replacing the scorer to be able to get the 3 detailed coreference metrics (B3, MUC, CEAFE),
+        and not only their average.
+        2/ Give the possibility to evaluate with the gold mentions: the model first predicts mentions that MIGHT
+        be part of a coreference cluster, and in a second step predicts the coreference clusters for these mentions.
+        We leave the possibility of replacing the predicted mentions
+        with the gold mentions in evaluation.
+    """
+    def __init__(self,
+                 vocab: Vocabulary,
+                 text_field_embedder: TextFieldEmbedder,
+                 context_layer: Seq2SeqEncoder,
+                 mention_feedforward: FeedForward,
+                 antecedent_feedforward: FeedForward,
+                 feature_size: int,
+                 max_span_width: int,
+                 spans_per_word: float,
+                 max_antecedents: int,
+                 lexical_dropout: float = 0.2,
+                 initializer: InitializerApplicator = InitializerApplicator(),
+                 regularizer: Optional[RegularizerApplicator] = None,
+                 eval_on_gold_mentions: bool = False) -> None:
+        super(CoreferenceCustom, self).__init__(vocab = vocab,
+                                                text_field_embedder = text_field_embedder,
+                                                context_layer = context_layer,
+                                                mention_feedforward = mention_feedforward,
+                                                antecedent_feedforward = antecedent_feedforward,
+                                                feature_size = feature_size,
+                                                max_span_width = max_span_width,
+                                                spans_per_word = spans_per_word,
+                                                max_antecedents = max_antecedents,
+                                                lexical_dropout = lexical_dropout,
+                                                initializer = initializer,
+                                                regularizer = regularizer)
+
+        self._conll_coref_scores = ConllCorefFullScores()
+        self._eval_on_gold_mentions = eval_on_gold_mentions
+        # `_use_gold_mentions` is read by `forward`; it starts disabled (falsy) in both cases.
+        if self._eval_on_gold_mentions:
+            self._use_gold_mentions = False
+        else:
+            self._use_gold_mentions = None
+
+
+    @overrides
+    def get_metrics(self,
+                    reset: bool = False,
+                    full:bool = False):
+        mention_recall = self._mention_recall.get_metric(reset = reset)
+        metrics = self._conll_coref_scores.get_metric(reset = reset, full = full)
+        metrics["mention_recall"] = mention_recall
+
+        return metrics
+
+    @overrides
+    def forward(self,  # type: ignore
+                text: Dict[str, torch.LongTensor],
+                spans: torch.IntTensor,
+                span_labels: torch.IntTensor = None,
+                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
+        # pylint: disable=arguments-differ
+
+        # Shape: (batch_size, document_length, embedding_size)
+        text_embeddings = self._lexical_dropout(self._text_field_embedder(text))
+
+        document_length = text_embeddings.size(1)
+
+        # Shape: (batch_size, document_length)
+        text_mask = util.get_text_field_mask(text).float()
+
+        # Shape: (batch_size, num_spans)
+        if self._use_gold_mentions:
+            # Only the spans matching a gold mention are kept as candidates. The gold
+            # clusters are read from ``metadata[0]`` only (a batch size of 1 is assumed).
+            gold_mentions = [torch.as_tensor(pair, dtype = torch.long, device = spans.device)
+                             for cluster in metadata[0]["clusters"] for pair in cluster]
+            gm = torch.stack(gold_mentions, dim = 0).unsqueeze(0).unsqueeze(1)
+            # A span is a gold mention iff both of its endpoints equal those of a gold span.
+            span_diff = spans.unsqueeze(2) - gm
+            is_gold = (span_diff[:, :, :, 0] == 0) & (span_diff[:, :, :, 1] == 0)
+            span_mask, _ = is_gold.max(-1)
+            span_mask = span_mask.float()
+        else:
+            span_mask = (spans[:, :, 0] >= 0).squeeze(-1).float()
+        # Size of the span dimension, needed below to flatten the batched span indices.
+        num_spans = spans.size(1)
+        # Shape: (batch_size, num_spans, 2)
+        spans = F.relu(spans.float()).long()
+
+        # Shape: (batch_size, document_length, encoding_dim)
+        contextualized_embeddings = self._context_layer(text_embeddings, text_mask)
+        # Shape: (batch_size, num_spans, 2 * encoding_dim + feature_size)
+        endpoint_span_embeddings = self._endpoint_span_extractor(contextualized_embeddings, spans)
+        # Shape: (batch_size, num_spans, embedding_size)
+        attended_span_embeddings = self._attentive_span_extractor(text_embeddings, spans)
+
+        # Shape: (batch_size, num_spans, embedding_size + 2 * encoding_dim + feature_size)
+        span_embeddings = torch.cat([endpoint_span_embeddings, attended_span_embeddings], -1)
+
+        # Prune based on mention scores.
+        num_spans_to_keep = int(math.floor(self._spans_per_word * document_length))
+
+        (top_span_embeddings, top_span_mask,
+         top_span_indices, top_span_mention_scores) = self._mention_pruner(span_embeddings,
+                                                                           span_mask,
+                                                                           num_spans_to_keep)
+        top_span_mask = top_span_mask.unsqueeze(-1)
+        # Shape: (batch_size * num_spans_to_keep)
+        flat_top_span_indices = util.flatten_and_batch_shift_indices(top_span_indices, num_spans)
+
+        # Compute final predictions for which spans to consider as mentions.
+        # Shape: (batch_size, num_spans_to_keep, 2)
+        top_spans = util.batched_index_select(spans,
+                                              top_span_indices,
+                                              flat_top_span_indices)
+
+        # Compute indices for antecedent spans to consider.
+        max_antecedents = min(self._max_antecedents, num_spans_to_keep)
+
+        # Shapes:
+        # (num_spans_to_keep, max_antecedents),
+        # (1, max_antecedents),
+        # (1, num_spans_to_keep, max_antecedents)
+        valid_antecedent_indices, valid_antecedent_offsets, valid_antecedent_log_mask = \
+            self._generate_valid_antecedents(num_spans_to_keep, max_antecedents, util.get_device_of(text_mask))
+        # Select tensors relating to the antecedent spans.
+        # Shape: (batch_size, num_spans_to_keep, max_antecedents, embedding_size)
+        candidate_antecedent_embeddings = util.flattened_index_select(top_span_embeddings,
+                                                                      valid_antecedent_indices)
+
+        # Shape: (batch_size, num_spans_to_keep, max_antecedents)
+        candidate_antecedent_mention_scores = util.flattened_index_select(top_span_mention_scores,
+                                                                          valid_antecedent_indices).squeeze(-1)
+        # Compute antecedent scores.
+        # Shape: (batch_size, num_spans_to_keep, max_antecedents, embedding_size)
+        span_pair_embeddings = self._compute_span_pair_embeddings(top_span_embeddings,
+                                                                  candidate_antecedent_embeddings,
+                                                                  valid_antecedent_offsets)
+        # Shape: (batch_size, num_spans_to_keep, 1 + max_antecedents)
+        coreference_scores = self._compute_coreference_scores(span_pair_embeddings,
+                                                              top_span_mention_scores,
+                                                              candidate_antecedent_mention_scores,
+                                                              valid_antecedent_log_mask)
+
+        # Shape: (batch_size, num_spans_to_keep)
+        _, predicted_antecedents = coreference_scores.max(2)
+        predicted_antecedents -= 1
+
+        output_dict = {"top_spans": top_spans,
+                       "antecedent_indices": valid_antecedent_indices,
+                       "predicted_antecedents": predicted_antecedents}
+        if span_labels is not None:
+            # Find the gold labels for the spans which we kept.
+            pruned_gold_labels = util.batched_index_select(span_labels.unsqueeze(-1),
+                                                           top_span_indices,
+                                                           flat_top_span_indices)
+
+            antecedent_labels = util.flattened_index_select(pruned_gold_labels,
+                                                            valid_antecedent_indices).squeeze(-1)
+            antecedent_labels += valid_antecedent_log_mask.long()
+
+            # Compute labels.
+            # Shape: (batch_size, num_spans_to_keep, max_antecedents + 1)
+            gold_antecedent_labels = self._compute_antecedent_gold_labels(pruned_gold_labels,
+                                                                          antecedent_labels)
+            coreference_log_probs = util.last_dim_log_softmax(coreference_scores, top_span_mask)
+            correct_antecedent_log_probs = coreference_log_probs + gold_antecedent_labels.log()
+            negative_marginal_log_likelihood = -util.logsumexp(correct_antecedent_log_probs).sum()
+
+            self._mention_recall(top_spans, metadata)
+            self._conll_coref_scores(top_spans, valid_antecedent_indices, predicted_antecedents, metadata)
+
+            output_dict["loss"] = negative_marginal_log_likelihood
+
+        if metadata is not None:
+            output_dict["document"] = [x["original_text"] for x in metadata]
+        return output_dict
\ No newline at end of file
diff --git a/hmtl/models/hmtl.py b/hmtl/models/hmtl.py
new file mode 100644
index 0000000..493ae2c
--- /dev/null
+++ b/hmtl/models/hmtl.py
@@ -0,0 +1,207 @@
+# coding: utf-8
+               
+import os
+import sys
+import logging
+from typing import Dict
+from overrides import overrides
+
+import torch
+
+from allennlp.common import Params
+from allennlp.common.checks import ConfigurationError
+from allennlp.data import Vocabulary
+from allennlp.models.model import Model
+from allennlp.modules import Seq2SeqEncoder, TextFieldEmbedder
+from allennlp.nn import RegularizerApplicator, InitializerApplicator
+from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
+from allennlp.modules import FeedForward
+from allennlp.models.crf_tagger import CrfTagger
+
+from hmtl.modules.text_field_embedders import ShortcutConnectTextFieldEmbedder
+from hmtl.models.relation_extraction import RelationExtractor
+from hmtl.models import CoreferenceCustom
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+@Model.register("hmtl")
+class HMTL(Model):
+    """
+    A class that implements the full HMTL model.
+
+    Parameters
+    ----------
+    vocab: ``allennlp.data.Vocabulary``, required.
+        The vocabulary fitted on the data.
+    params: ``allennlp.common.Params``, required
+        Configuration parameters for the multi-task model.
+    regularizer: ``allennlp.nn.RegularizerApplicator``, optional (default = None)
+        A regularizer to apply to the model's layers.
+    """
+    def __init__(self,
+                vocab: Vocabulary,
+                params: Params,
+                regularizer: RegularizerApplicator = None):
+
+        super(HMTL, self).__init__(vocab = vocab, regularizer = regularizer)
+
+        # Base text Field Embedder
+        text_field_embedder_params = params.pop("text_field_embedder")
+        text_field_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
+                                                                params=text_field_embedder_params)
+        self._text_field_embedder = text_field_embedder
+
+
+        ############
+        # NER Stuffs
+        ############
+        ner_params = params.pop("ner")
+
+        # Encoder
+        encoder_ner_params = ner_params.pop("encoder")
+        encoder_ner = Seq2SeqEncoder.from_params(encoder_ner_params)
+        self._encoder_ner =  encoder_ner
+
+        # Tagger NER - CRF Tagger
+        tagger_ner_params = ner_params.pop("tagger")
+        tagger_ner = CrfTagger(vocab = vocab,
+                            text_field_embedder = self._text_field_embedder,
+                            encoder = self._encoder_ner,
+                            label_namespace = tagger_ner_params.pop("label_namespace", "labels"),
+                            constraint_type = tagger_ner_params.pop("constraint_type", None),
+                            dropout = tagger_ner_params.pop("dropout", None),
+                            regularizer = regularizer)
+        self._tagger_ner = tagger_ner
+
+
+        ############
+        # EMD Stuffs
+        ############
+        emd_params = params.pop("emd")
+
+        # Encoder
+        encoder_emd_params = emd_params.pop("encoder")
+        encoder_emd = Seq2SeqEncoder.from_params(encoder_emd_params)
+        self._encoder_emd =  encoder_emd
+
+        shortcut_text_field_embedder = ShortcutConnectTextFieldEmbedder(base_text_field_embedder = self._text_field_embedder,
+                                                                        previous_encoders = [self._encoder_ner])
+        self._shortcut_text_field_embedder = shortcut_text_field_embedder
+
+
+        # Tagger: EMD - CRF Tagger
+        tagger_emd_params = emd_params.pop("tagger")
+        tagger_emd = CrfTagger(vocab = vocab,
+                                text_field_embedder = self._shortcut_text_field_embedder,
+                                encoder = self._encoder_emd,
+                                label_namespace = tagger_emd_params.pop("label_namespace", "labels"),
+                                constraint_type = tagger_emd_params.pop("constraint_type", None),
+                                dropout = tagger_emd_params.pop("dropout", None),
+                                regularizer = regularizer)
+        self._tagger_emd = tagger_emd
+
+
+        ############################
+        # Relation Extraction Stuffs
+        ############################
+        relation_params = params.pop("relation")
+
+        # Encoder
+        encoder_relation_params = relation_params.pop("encoder")
+        encoder_relation = Seq2SeqEncoder.from_params(encoder_relation_params)
+        self._encoder_relation =  encoder_relation
+
+        shortcut_text_field_embedder_relation = ShortcutConnectTextFieldEmbedder(base_text_field_embedder = self._text_field_embedder,
+                                                                                previous_encoders = [self._encoder_ner, self._encoder_emd])
+        self._shortcut_text_field_embedder_relation = shortcut_text_field_embedder_relation
+
+        # Tagger: Relation
+        tagger_relation_params = relation_params.pop("tagger")
+        tagger_relation = RelationExtractor(vocab = vocab,
+                                            text_field_embedder = self._shortcut_text_field_embedder_relation,
+                                            context_layer = self._encoder_relation,
+                                            d = tagger_relation_params.pop_int("d"),
+                                            l = tagger_relation_params.pop_int("l"),
+                                            n_classes = tagger_relation_params.pop("n_classes"),
+                                            activation = tagger_relation_params.pop("activation"))
+        self._tagger_relation = tagger_relation
+
+
+        ##############
+        # Coref Stuffs
+        ##############
+        coref_params = params.pop("coref")
+
+        # Encoder
+        encoder_coref_params = coref_params.pop("encoder")
+        encoder_coref = Seq2SeqEncoder.from_params(encoder_coref_params)
+        self._encoder_coref =  encoder_coref
+
+        shortcut_text_field_embedder_coref = ShortcutConnectTextFieldEmbedder(base_text_field_embedder = self._text_field_embedder,
+                                                                            previous_encoders = [self._encoder_ner, self._encoder_emd])
+        self._shortcut_text_field_embedder_coref = shortcut_text_field_embedder_coref
+
+        # Tagger: Coreference
+        tagger_coref_params = coref_params.pop("tagger")
+        eval_on_gold_mentions = tagger_coref_params.pop_bool("eval_on_gold_mentions", False)
+        init_params = tagger_coref_params.pop("initializer", None)
+        initializer = (InitializerApplicator.from_params(init_params)
+                if init_params is not None
+                else InitializerApplicator())
+
+        tagger_coref = CoreferenceCustom(vocab = vocab,
+                                        text_field_embedder = self._shortcut_text_field_embedder_coref,
+                                        context_layer = self._encoder_coref,
+                                        mention_feedforward = FeedForward.from_params(tagger_coref_params.pop("mention_feedforward")),
+                                        antecedent_feedforward = FeedForward.from_params(tagger_coref_params.pop("antecedent_feedforward")),
+                                        feature_size = tagger_coref_params.pop_int("feature_size"),
+                                        max_span_width = tagger_coref_params.pop_int("max_span_width"),
+                                        spans_per_word = tagger_coref_params.pop_float("spans_per_word"),
+                                        max_antecedents = tagger_coref_params.pop_int("max_antecedents"),
+                                        lexical_dropout = tagger_coref_params.pop_float("lexical_dropout", 0.2),
+                                        initializer = initializer,
+                                        regularizer = regularizer,
+                                        eval_on_gold_mentions = eval_on_gold_mentions)
+        self._tagger_coref = tagger_coref
+        if eval_on_gold_mentions:
+            self._tagger_coref._eval_on_gold_mentions = True
+
+        logger.info("Multi-Task Learning Model has been instantiated.")
+
+    @overrides
+    def forward(self,
+                tensor_batch,
+                for_training: bool = False,
+                task_name: str = "ner") -> Dict[str, torch.Tensor]:
+        # pylint: disable=arguments-differ
+        # Dispatch the batch to the tagger of the requested task.
+        tagger = getattr(self, "_tagger_%s" % task_name)
+
+        if task_name == "coref" and tagger._eval_on_gold_mentions:
+            # Gold mentions are only used for evaluation, never for training.
+            tagger._use_gold_mentions = not for_training
+
+        return tagger.forward(**tensor_batch)
+
+    @overrides
+    def get_metrics(self,
+                    task_name: str,
+                    reset: bool = False,
+                    full: bool = False) -> Dict[str, float]:
+
+        task_tagger = getattr(self, "_tagger_" + task_name)
+        if full and task_name == "coref":
+            return task_tagger.get_metrics(reset = reset, full = full)
+        else:
+            return task_tagger.get_metrics(reset)
+
+    @classmethod
+    def from_params(cls,
+                    vocab: Vocabulary,
+                    params: Params,
+                    regularizer: RegularizerApplicator) -> "HMTL":
+        return cls(vocab = vocab,
+                params = params,
+                regularizer = regularizer)
+                                
\ No newline at end of file
diff --git a/hmtl/models/layerCoref.py b/hmtl/models/layerCoref.py
new file mode 100644
index 0000000..b0f7f24
--- /dev/null
+++ b/hmtl/models/layerCoref.py
@@ -0,0 +1,126 @@
+# coding: utf-8
+               
+import os
+import sys
+import logging
+from typing import Dict
+from overrides import overrides
+
+import torch
+
+from allennlp.common import Params
+from allennlp.common.checks import ConfigurationError
+from allennlp.data import Vocabulary
+from allennlp.models.model import Model
+from allennlp.modules import Seq2SeqEncoder, TextFieldEmbedder
+from allennlp.nn import RegularizerApplicator, InitializerApplicator
+from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
+from allennlp.modules import FeedForward
+from allennlp.models.crf_tagger import CrfTagger
+
+from hmtl.modules.text_field_embedders import ShortcutConnectTextFieldEmbedder
+from hmtl.models.relation_extraction import RelationExtractor
+from hmtl.models import CoreferenceCustom
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+@Model.register("coref_custom")
+class LayerCoref(Model):
+    """
+    A model wrapping a single task of HMTL: Coref (Lee et al).
+
+    Parameters
+    ----------
+    vocab: ``allennlp.data.Vocabulary``, required.
+        Vocabulary fitted on the data.
+    params: ``allennlp.common.Params``, required
+        Model configuration; read keys are ``text_field_embedder`` and ``coref``.
+    regularizer: ``allennlp.nn.RegularizerApplicator``, optional (default = None)
+        Regularizer applied to the model's layers.
+    """
+    def __init__(self,
+                 vocab: Vocabulary,
+                 params: Params,
+                 regularizer: RegularizerApplicator = None):
+
+        super().__init__(vocab=vocab, regularizer=regularizer)
+
+        # Base embedder, built from the "text_field_embedder" section.
+        embedder_config = params.pop("text_field_embedder")
+        self._text_field_embedder = BasicTextFieldEmbedder.from_params(
+            vocab=vocab,
+            params=embedder_config)
+
+        #######################
+        # Coreference section
+        #######################
+        coref_config = params.pop("coref")
+
+        # Encoder: built from the "encoder" sub-section and used as the
+        # context layer of the coreference tagger.
+        encoder_config = coref_config.pop("encoder")
+        self._encoder_coref = Seq2SeqEncoder.from_params(encoder_config)
+
+        # Tagger: Coreference
+        tagger_config = coref_config.pop("tagger")
+        eval_on_gold_mentions = tagger_config.pop_bool("eval_on_gold_mentions", False)
+        initializer_config = tagger_config.pop("initializer", None)
+        if initializer_config is None:
+            initializer = InitializerApplicator()
+        else:
+            initializer = InitializerApplicator.from_params(initializer_config)
+        self._tagger_coref = CoreferenceCustom(
+            vocab=vocab,
+            text_field_embedder=self._text_field_embedder,
+            context_layer=self._encoder_coref,
+            mention_feedforward=FeedForward.from_params(tagger_config.pop("mention_feedforward")),
+            antecedent_feedforward=FeedForward.from_params(tagger_config.pop("antecedent_feedforward")),
+            feature_size=tagger_config.pop_int("feature_size"),
+            max_span_width=tagger_config.pop_int("max_span_width"),
+            spans_per_word=tagger_config.pop_float("spans_per_word"),
+            max_antecedents=tagger_config.pop_int("max_antecedents"),
+            lexical_dropout=tagger_config.pop_float("lexical_dropout", 0.2),
+            initializer=initializer,
+            regularizer=regularizer,
+            eval_on_gold_mentions=eval_on_gold_mentions)
+        # Mirror the flag on the tagger instance when gold-mention evaluation is on.
+        if eval_on_gold_mentions:
+            self._tagger_coref._eval_on_gold_mentions = True
+        logger.info("Multi-Task Learning Model has been instantiated.")
+
+    @overrides
+    def forward(self,
+                tensor_batch,
+                for_training: bool = False,
+                task_name: str = "coref") -> Dict[str, torch.Tensor]:
+        # pylint: disable=arguments-differ
+        """Run the tagger stored as ``_tagger_<task_name>`` on ``tensor_batch``."""
+        task_tagger = getattr(self, "_tagger_%s" % task_name)
+
+        # When the coref tagger has ``_eval_on_gold_mentions`` set, its
+        # ``_use_gold_mentions`` switch is on for non-training calls only.
+        if task_name == "coref" and task_tagger._eval_on_gold_mentions:
+            task_tagger._use_gold_mentions = not for_training
+        return task_tagger.forward(**tensor_batch)
+
+    @overrides
+    def get_metrics(self,
+                    task_name: str,
+                    reset: bool = False,
+                    full: bool = False) -> Dict[str, float]:
+        """Return the metrics of the tagger stored as ``_tagger_<task_name>``."""
+        tagger = getattr(self, "_tagger_" + task_name)
+        # ``full`` is only forwarded when the coref tagger is queried.
+        if not (full and task_name == "coref"):
+            return tagger.get_metrics(reset)
+        return tagger.get_metrics(reset=reset, full=full)
+
+    @classmethod
+    def from_params(cls,
+                    vocab: Vocabulary,
+                    params: Params,
+                    regularizer: RegularizerApplicator) -> "LayerCoref":
+        """Build the model by forwarding the three arguments to ``cls``."""
+        init_kwargs = dict(vocab=vocab, params=params, regularizer=regularizer)
+        return cls(**init_kwargs)
\ No newline at end of file
diff --git a/hmtl/models/layerEmdCoref.py b/hmtl/models/layerEmdCoref.py
new file mode 100644
index 0000000..657581d
--- /dev/null
+++ b/hmtl/models/layerEmdCoref.py
@@ -0,0 +1,155 @@
+# coding: utf-8
+               
+import os
+import sys
+import logging
+from typing import Dict
+from overrides import overrides
+
+import torch
+
+from allennlp.common import Params
+from allennlp.data import Vocabulary
+from allennlp.models.model import Model
+from allennlp.modules import Seq2SeqEncoder, TextFieldEmbedder
+from allennlp.nn import RegularizerApplicator, InitializerApplicator
+from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
+from allennlp.modules import FeedForward
+from allennlp.models.crf_tagger import CrfTagger
+
+from hmtl.modules.text_field_embedders import ShortcutConnectTextFieldEmbedder
+from hmtl.models import CoreferenceCustom
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+@Model.register("emd_coref")
+class LayerEmdCoref(Model):
+    """
+    A model wrapping two tasks of HMTL: EMD (CRF Tagger) and Coref (Lee et al., 2017).
+
+    Parameters
+    ----------
+    vocab: ``allennlp.data.Vocabulary``, required.
+        Vocabulary fitted on the data.
+    params: ``allennlp.common.Params``, required
+        Model configuration; read keys are ``text_field_embedder``, ``emd`` and ``coref``.
+    regularizer: ``allennlp.nn.RegularizerApplicator``, optional (default = None)
+        Regularizer applied to the model's layers.
+    """
+    def __init__(self,
+                 vocab: Vocabulary,
+                 params: Params,
+                 regularizer: RegularizerApplicator = None):
+
+        super().__init__(vocab=vocab, regularizer=regularizer)
+
+        # Base embedder used by both tasks, built from the
+        # "text_field_embedder" section of the configuration.
+        embedder_config = params.pop("text_field_embedder")
+        self._text_field_embedder = BasicTextFieldEmbedder.from_params(
+            vocab=vocab,
+            params=embedder_config)
+
+        ###############
+        # EMD section
+        ###############
+        emd_config = params.pop("emd")
+
+        # Encoder: also handed further down to the coreference shortcut
+        # embedder as its only "previous encoder".
+        emd_encoder_config = emd_config.pop("encoder")
+        self._encoder_emd = Seq2SeqEncoder.from_params(emd_encoder_config)
+
+        # Tagger EMD - CRF Tagger
+        emd_tagger_config = emd_config.pop("tagger")
+        self._tagger_emd = CrfTagger(
+            vocab=vocab,
+            text_field_embedder=self._text_field_embedder,
+            encoder=self._encoder_emd,
+            label_namespace=emd_tagger_config.pop("label_namespace", "labels"),
+            constraint_type=emd_tagger_config.pop("constraint_type", None),
+            dropout=emd_tagger_config.pop("dropout", None),
+            regularizer=regularizer)
+
+
+        #######################
+        # Coreference section
+        #######################
+        coref_config = params.pop("coref")
+
+        # Encoder: built from the "encoder" sub-section and used as the
+        # context layer of the coreference tagger.
+        coref_encoder_config = coref_config.pop("encoder")
+        self._encoder_coref = Seq2SeqEncoder.from_params(coref_encoder_config)
+
+        # Shortcut embedder: wraps the base embedder and the EMD encoder.
+        self._shortcut_text_field_embedder_coref = ShortcutConnectTextFieldEmbedder(
+            base_text_field_embedder=self._text_field_embedder,
+            previous_encoders=[self._encoder_emd])
+        # Tagger: Coreference
+        tagger_config = coref_config.pop("tagger")
+        eval_on_gold_mentions = tagger_config.pop_bool("eval_on_gold_mentions", False)
+        initializer_config = tagger_config.pop("initializer", None)
+        if initializer_config is None:
+            initializer = InitializerApplicator()
+        else:
+            initializer = InitializerApplicator.from_params(initializer_config)
+        self._tagger_coref = CoreferenceCustom(
+            vocab=vocab,
+            text_field_embedder=self._shortcut_text_field_embedder_coref,
+            context_layer=self._encoder_coref,
+            mention_feedforward=FeedForward.from_params(tagger_config.pop("mention_feedforward")),
+            antecedent_feedforward=FeedForward.from_params(tagger_config.pop("antecedent_feedforward")),
+            feature_size=tagger_config.pop_int("feature_size"),
+            max_span_width=tagger_config.pop_int("max_span_width"),
+            spans_per_word=tagger_config.pop_float("spans_per_word"),
+            max_antecedents=tagger_config.pop_int("max_antecedents"),
+            lexical_dropout=tagger_config.pop_float("lexical_dropout", 0.2),
+            initializer=initializer,
+            regularizer=regularizer,
+            eval_on_gold_mentions=eval_on_gold_mentions)
+        # Mirror the flag on the tagger instance when gold-mention evaluation is on.
+        if eval_on_gold_mentions:
+            self._tagger_coref._eval_on_gold_mentions = True
+        logger.info("Multi-Task Learning Model has been instantiated.")
+
+    @overrides
+    def forward(self,
+                tensor_batch,
+                for_training: bool = False,
+                task_name: str = "emd") -> Dict[str, torch.Tensor]:
+        # pylint: disable=arguments-differ
+        """
+        Run the tagger stored as ``_tagger_<task_name>`` on ``tensor_batch``.
+
+        Coreference is special-cased: gold mentions can be used to predict
+        the clusters during evaluation, but not during training. The switch
+        is only touched when the tagger has ``_eval_on_gold_mentions`` set.
+        """
+        task_tagger = getattr(self, "_tagger_%s" % task_name)
+
+        if task_name == "coref" and task_tagger._eval_on_gold_mentions:
+            task_tagger._use_gold_mentions = not for_training
+        return task_tagger.forward(**tensor_batch)
+
+    @overrides
+    def get_metrics(self,
+                    task_name: str = "emd",
+                    reset: bool = False,
+                    full: bool = False) -> Dict[str, float]:
+        """Return the metrics of the tagger stored as ``_tagger_<task_name>``."""
+        tagger = getattr(self, "_tagger_" + task_name)
+        # ``full`` is only forwarded when the coref tagger is queried.
+        if not (full and task_name == "coref"):
+            return tagger.get_metrics(reset=reset)
+        return tagger.get_metrics(reset=reset, full=full)
+
+    @classmethod
+    def from_params(cls,
+                    vocab: Vocabulary,
+                    params: Params,
+                    regularizer: RegularizerApplicator) -> "LayerEmdCoref":
+        """Build the model by forwarding the three arguments to ``cls``."""
+        init_kwargs = dict(vocab=vocab, params=params, regularizer=regularizer)
+        return cls(**init_kwargs)
\ No newline at end of file
diff --git a/hmtl/models/layerEmdRelation.py b/hmtl/models/layerEmdRelation.py
new file mode 100644
index 0000000..e39210d
--- /dev/null
+++ b/hmtl/models/layerEmdRelation.py
@@ -0,0 +1,129 @@
+# coding: utf-8
+               
+import os
+import sys
+import logging
+from typing import Dict
+from overrides import overrides
+
+import torch
+
+from allennlp.common import Params
+from allennlp.data import Vocabulary
+from allennlp.models.model import Model
+from allennlp.modules import Seq2SeqEncoder, TextFieldEmbedder
+from allennlp.nn import RegularizerApplicator
+from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
+from allennlp.models.crf_tagger import CrfTagger
+
+from hmtl.modules.text_field_embedders import ShortcutConnectTextFieldEmbedder
+from hmtl.models.relation_extraction import RelationExtractor
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+@Model.register("emd_relation")
+class LayerEmdRelation(Model):
+    """
+    A class that implements two tasks of the HMTL model: EMD (CRF Tagger) and Relation Extraction.
+
+    Parameters
+    ----------
+    vocab: ``allennlp.data.Vocabulary``, required.
+        The vocabulary fitted on the data.
+    params: ``allennlp.common.Params``, required
+        Model configuration; read keys are ``text_field_embedder``, ``emd`` and ``relation``.
+    regularizer: ``allennlp.nn.RegularizerApplicator``, optional (default = None)
+        A regularizer to apply to the model's layers.
+    """
+    def __init__(self,
+                 vocab: Vocabulary,
+                 params: Params,
+                 regularizer: RegularizerApplicator = None):
+
+        super(LayerEmdRelation, self).__init__(vocab=vocab, regularizer=regularizer)
+
+        # Base embedder used by both tasks, built from the
+        # "text_field_embedder" section of the configuration.
+        text_field_embedder_params = params.pop("text_field_embedder")
+        self._text_field_embedder = BasicTextFieldEmbedder.from_params(
+            vocab=vocab,
+            params=text_field_embedder_params)
+
+
+        ############
+        # EMD Stuffs
+        ############
+        emd_params = params.pop("emd")
+
+        # Encoder: also handed further down to the relation shortcut
+        # embedder as its only "previous encoder".
+        encoder_emd_params = emd_params.pop("encoder")
+        self._encoder_emd = Seq2SeqEncoder.from_params(encoder_emd_params)
+
+        # Tagger EMD - CRF Tagger
+        tagger_emd_params = emd_params.pop("tagger")
+        self._tagger_emd = CrfTagger(
+            vocab=vocab,
+            text_field_embedder=self._text_field_embedder,
+            encoder=self._encoder_emd,
+            label_namespace=tagger_emd_params.pop("label_namespace", "labels"),
+            constraint_type=tagger_emd_params.pop("constraint_type", None),
+            dropout=tagger_emd_params.pop("dropout", None),
+            regularizer=regularizer)
+
+
+        ############################
+        # Relation Extraction Stuffs
+        ############################
+        relation_params = params.pop("relation")
+
+        # Encoder: built from the "encoder" sub-section and used as the
+        # context layer of the relation tagger.
+        encoder_relation_params = relation_params.pop("encoder")
+        self._encoder_relation = Seq2SeqEncoder.from_params(encoder_relation_params)
+
+        # Shortcut embedder: wraps the base embedder and the EMD encoder.
+        self._shortcut_text_field_embedder_relation = ShortcutConnectTextFieldEmbedder(
+            base_text_field_embedder=self._text_field_embedder,
+            previous_encoders=[self._encoder_emd])
+        # Tagger: Relation
+        tagger_relation_params = relation_params.pop("tagger")
+        self._tagger_relation = RelationExtractor(
+            vocab=vocab,
+            text_field_embedder=self._shortcut_text_field_embedder_relation,
+            context_layer=self._encoder_relation,
+            d=tagger_relation_params.pop_int("d"),
+            l=tagger_relation_params.pop_int("l"),
+            n_classes=tagger_relation_params.pop("n_classes"),
+            activation=tagger_relation_params.pop("activation"))
+
+        logger.info("Multi-Task Learning Model has been instantiated.")
+
+    @overrides
+    def forward(self,
+                tensor_batch,
+                for_training: bool = False,
+                task_name: str = "emd") -> Dict[str, torch.Tensor]:
+        # pylint: disable=arguments-differ
+        """Run ``_tagger_<task_name>`` on ``tensor_batch``; ``for_training`` is unused here."""
+        # The default is "emd": this model defines no ``_tagger_ner`` attribute.
+        return getattr(self, "_tagger_%s" % task_name).forward(**tensor_batch)
+
+    @overrides
+    def get_metrics(self,
+                    task_name: str,
+                    reset: bool = False,
+                    full: bool = False) -> Dict[str, float]:
+        """Return the metrics of ``_tagger_<task_name>``; ``full`` is unused here."""
+        task_tagger = getattr(self, "_tagger_" + task_name)
+        return task_tagger.get_metrics(reset)
+
+    @classmethod
+    def from_params(cls,
+                    vocab: Vocabulary,
+                    params: Params,
+                    regularizer: RegularizerApplicator) -> "LayerEmdRelation":
+        """Build the model by forwarding the three arguments to ``cls``."""
+        return cls(vocab=vocab,
+                   params=params,
+                   regularizer=regularizer)
\ No newline at end of file
diff --git a/hmtl/models/layerNer.py b/hmtl/models/layerNer.py
new file mode 100644
index 0000000..945438a
--- /dev/null
+++ b/hmtl/models/layerNer.py
@@ -0,0 +1,99 @@
+# coding: utf-8
+
+import os
+import sys
+import logging
+from typing import Dict
+from overrides import overrides
+
+import torch
+
+from allennlp.common import Params
+from allennlp.data import Vocabulary
+from allennlp.models.model import Model
+from allennlp.modules import Seq2SeqEncoder, TextFieldEmbedder
+from allennlp.nn import RegularizerApplicator
+from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
+from allennlp.models.crf_tagger import CrfTagger
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+        
+@Model.register("ner")
+class LayerNer(Model):
+    """
+    A model wrapping the first task of HMTL: NER (CRF Tagger).
+
+    Parameters
+    ----------
+    vocab: ``allennlp.data.Vocabulary``, required.
+        Vocabulary fitted on the data.
+    params: ``allennlp.common.Params``, required
+        Model configuration; read keys are ``text_field_embedder`` and ``ner``.
+    regularizer: ``allennlp.nn.RegularizerApplicator``, optional (default = None)
+        Regularizer applied to the model's layers.
+    """
+    def __init__(self,
+                 vocab: Vocabulary,
+                 params: Params,
+                 regularizer: RegularizerApplicator = None):
+
+        super().__init__(vocab=vocab, regularizer=regularizer)
+
+        # Base embedder, built from the "text_field_embedder" section.
+        embedder_config = params.pop("text_field_embedder")
+        self._text_field_embedder = BasicTextFieldEmbedder.from_params(
+            vocab=vocab,
+            params=embedder_config)
+
+        ###############
+        # NER section
+        ###############
+        ner_config = params.pop("ner")
+
+        # Encoder: built from the "encoder" sub-section and passed to the
+        # CRF tagger below.
+        encoder_config = ner_config.pop("encoder")
+        self._encoder_ner = Seq2SeqEncoder.from_params(encoder_config)
+
+        # Tagger NER - CRF Tagger
+        tagger_config = ner_config.pop("tagger")
+        self._tagger_ner = CrfTagger(
+            vocab=vocab,
+            text_field_embedder=self._text_field_embedder,
+            encoder=self._encoder_ner,
+            label_namespace=tagger_config.pop("label_namespace", "labels"),
+            constraint_type=tagger_config.pop("constraint_type", None),
+            dropout=tagger_config.pop("dropout", None),
+            regularizer=regularizer)
+
+        logger.info("Multi-Task Learning Model has been instantiated.")
+
+    @overrides
+    def forward(self,
+                tensor_batch,
+                for_training: bool = False,
+                task_name: str = "ner") -> Dict[str, torch.Tensor]:
+        # pylint: disable=arguments-differ
+        """Run ``_tagger_<task_name>`` on ``tensor_batch``; ``for_training`` is unused here."""
+        task_tagger = getattr(self, "_tagger_%s" % task_name)
+        return task_tagger.forward(**tensor_batch)
+
+    @overrides
+    def get_metrics(self,
+                    task_name: str = "ner",
+                    reset: bool = False,
+                    full: bool = False) -> Dict[str, float]:
+        """Return the metrics of ``_tagger_<task_name>``; ``full`` is unused here."""
+        tagger = getattr(self, "_tagger_" + task_name)
+        return tagger.get_metrics(reset=reset)
+
+    @classmethod
+    def from_params(cls,
+                    vocab: Vocabulary,
+                    params: Params,
+                    regularizer: RegularizerApplicator) -> "LayerNer":
+        """Build the model by forwarding the three arguments to ``cls``."""
+        init_kwargs = dict(vocab=vocab, params=params, regularizer=regularizer)
+        return cls(**init_kwargs)
+        
\ No newline at end of file
diff --git a/hmtl/models/layerNerEmd.py b/hmtl/models/layerNerEmd.py
new file mode 100644
index 0000000..f97c532
--- /dev/null
+++ b/hmtl/models/layerNerEmd.py
@@ -0,0 +1,127 @@
+# coding: utf-8
+               
+import os
+import sys
+import logging
+from typing import Dict
+from overrides import overrides
+
+import torch
+
+from allennlp.common import Params
+from allennlp.data import Vocabulary
+from allennlp.models.model import Model
+from allennlp.modules import Seq2SeqEncoder, TextFieldEmbedder
+from allennlp.nn import RegularizerApplicator
+from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
+from allennlp.models.crf_tagger import CrfTagger
+
+from hmtl.modules.text_field_embedders import ShortcutConnectTextFieldEmbedder
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+@Model.register("ner_emd")
+class LayerNerEmd(Model):
+    """
+    A class that implements two tasks of the HMTL model: NER (CRF Tagger) and EMD (CRF Tagger).
+
+    Parameters
+    ----------
+    vocab: ``allennlp.data.Vocabulary``, required.
+        The vocabulary fitted on the data.
+    params: ``allennlp.common.Params``, required
+        Model configuration; read keys are ``text_field_embedder``, ``ner`` and ``emd``.
+    regularizer: ``allennlp.nn.RegularizerApplicator``, optional (default = None)
+        A regularizer to apply to the model's layers.
+    """
+    def __init__(self,
+                 vocab: Vocabulary,
+                 params: Params,
+                 regularizer: RegularizerApplicator = None):
+
+        super(LayerNerEmd, self).__init__(vocab=vocab, regularizer=regularizer)
+
+        # Base embedder used by both tasks ("text_field_embedder" section).
+        text_field_embedder_params = params.pop("text_field_embedder")
+        self._text_field_embedder = BasicTextFieldEmbedder.from_params(
+            vocab=vocab,
+            params=text_field_embedder_params)
+
+        ############
+        # NER Stuffs
+        ############
+        ner_params = params.pop("ner")
+
+        # Encoder: also handed further down to the EMD shortcut embedder
+        # as its only "previous encoder".
+        encoder_ner_params = ner_params.pop("encoder")
+        self._encoder_ner = Seq2SeqEncoder.from_params(encoder_ner_params)
+
+        # Tagger NER - CRF Tagger
+        tagger_ner_params = ner_params.pop("tagger")
+        self._tagger_ner = CrfTagger(
+            vocab=vocab,
+            text_field_embedder=self._text_field_embedder,
+            encoder=self._encoder_ner,
+            label_namespace=tagger_ner_params.pop("label_namespace", "labels"),
+            constraint_type=tagger_ner_params.pop("constraint_type", None),
+            dropout=tagger_ner_params.pop("dropout", None),
+            regularizer=regularizer)
+
+
+        ############
+        # EMD Stuffs
+        ############
+        emd_params = params.pop("emd")
+
+        # Encoder: built from the "encoder" sub-section and passed to the
+        # EMD CRF tagger below.
+        encoder_emd_params = emd_params.pop("encoder")
+        self._encoder_emd = Seq2SeqEncoder.from_params(encoder_emd_params)
+
+        # Shortcut embedder: wraps the base embedder and the NER encoder.
+        self._shortcut_text_field_embedder = ShortcutConnectTextFieldEmbedder(
+            base_text_field_embedder=self._text_field_embedder,
+            previous_encoders=[self._encoder_ner])
+
+        # Tagger: EMD - CRF Tagger
+        tagger_emd_params = emd_params.pop("tagger")
+        self._tagger_emd = CrfTagger(
+            vocab=vocab,
+            text_field_embedder=self._shortcut_text_field_embedder,
+            encoder=self._encoder_emd,
+            label_namespace=tagger_emd_params.pop("label_namespace", "labels"),
+            constraint_type=tagger_emd_params.pop("constraint_type", None),
+            dropout=tagger_emd_params.pop("dropout", None),  # EMD's own value; NER's was popped above
+            regularizer=regularizer)
+
+        logger.info("Multi-Task Learning Model has been instantiated.")
+
+    @overrides
+    def forward(self,
+                tensor_batch,
+                for_training: bool = False,
+                task_name: str = "ner") -> Dict[str, torch.Tensor]:
+        # pylint: disable=arguments-differ
+        """Run ``_tagger_<task_name>`` on ``tensor_batch``; ``for_training`` is unused here."""
+        tagger = getattr(self, "_tagger_%s" % task_name)
+        return tagger.forward(**tensor_batch)
+
+    @overrides
+    def get_metrics(self,
+                    task_name: str,
+                    reset: bool = False,
+                    full: bool = False) -> Dict[str, float]:
+        """Return the metrics of ``_tagger_<task_name>``; ``full`` is unused here."""
+        task_tagger = getattr(self, "_tagger_" + task_name)
+        return task_tagger.get_metrics(reset)
+
+    @classmethod
+    def from_params(cls,
+                    vocab: Vocabulary,
+                    params: Params,
+                    regularizer: RegularizerApplicator) -> "LayerNerEmd":
+        """Build the model by forwarding the three arguments to ``cls``."""
+        return cls(vocab=vocab,
+                   params=params,
+                   regularizer=regularizer)
diff --git a/hmtl/models/layerNerEmdCoref.py b/hmtl/models/layerNerEmdCoref.py
new file mode 100644
index 0000000..9a8d146
--- /dev/null
+++ b/hmtl/models/layerNerEmdCoref.py
@@ -0,0 +1,183 @@
+# coding: utf-8
+               
+import os
+import sys
+import logging
+from typing import Dict
+from overrides import overrides
+
+import torch
+
+from allennlp.common import Params
+from allennlp.data import Vocabulary
+from allennlp.models.model import Model
+from allennlp.modules import Seq2SeqEncoder, TextFieldEmbedder
+from allennlp.nn import RegularizerApplicator, InitializerApplicator
+from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
+from allennlp.modules import FeedForward
+from allennlp.models.crf_tagger import CrfTagger
+
+from hmtl.modules.text_field_embedders import ShortcutConnectTextFieldEmbedder
+from hmtl.models import CoreferenceCustom
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+@Model.register("ner_emd_coref")
+class LayerNerEmdCoref(Model):
+    """
+    A class that implement three tasks of HMTL model: NER (CRF Tagger), EMD (CRF Tagger) and Coreference Resolution.
+    
+    Parameters
+    ----------
+    vocab: ``allennlp.data.Vocabulary``, required.
+        The vocabulary fitted on the data.
+    params: ``allennlp.common.Params``, required
+        Configuration parameters for the multi-task model.
+    regularizer: ``allennlp.nn.RegularizerApplicator``, optional (default = None)
+        A reguralizer to apply to the model's layers.
+    """
+    def __init__(self,
+                vocab: Vocabulary,
+                params: Params,
+                regularizer: RegularizerApplicator = None):
+                
+        super(LayerNerEmdCoref, self).__init__(vocab = vocab, regularizer = regularizer)
+        
+        # Base text Field Embedder
+        text_field_embedder_params = params.pop("text_field_embedder")
+        text_field_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, 
+                                                                params=text_field_embedder_params)
+        self._text_field_embedder = text_field_embedder
+
+        
+        ############
+        # NER Stuffs
+        ############
+        ner_params = params.pop("ner")
+        
+        # Encoder
+        encoder_ner_params = ner_params.pop("encoder")
+        encoder_ner = Seq2SeqEncoder.from_params(encoder_ner_params)
+        self._encoder_ner =  encoder_ner
+        
+        # Tagger NER - CRF Tagger
+        tagger_ner_params = ner_params.pop("tagger")
+        tagger_ner = CrfTagger(vocab = vocab,
+                            text_field_embedder = self._text_field_embedder,
+                            encoder = self._encoder_ner,
+                            label_namespace = tagger_ner_params.pop("label_namespace", "labels"),
+                            constraint_type = tagger_ner_params.pop("constraint_type", None),
+                            dropout = tagger_ner_params.pop("dropout", None),
+                            regularizer = regularizer)
+        self._tagger_ner = tagger_ner
+        
+        
+        ############
+        # EMD Stuffs
+        ############
+        emd_params = params.pop("emd")
+        
+        # Encoder
+        encoder_emd_params = emd_params.pop("encoder")
+        encoder_emd = Seq2SeqEncoder.from_params(encoder_emd_params)
+        self._encoder_emd =  encoder_emd
+        
+        shortcut_text_field_embedder = ShortcutConnectTextFieldEmbedder(base_text_field_embedder = self._text_field_embedder,
+                                                                        previous_encoders = [self._encoder_ner])
+        self._shortcut_text_field_embedder = shortcut_text_field_embedder
+        
+        
+        # Tagger: EMD - CRF Tagger
+        tagger_emd_params = emd_params.pop("tagger")
+        tagger_emd = CrfTagger(vocab = vocab,
+                                text_field_embedder = self._shortcut_text_field_embedder,
+                                encoder = self._encoder_emd,
+                                label_namespace = tagger_emd_params.pop("label_namespace", "labels"),
+                                constraint_type = tagger_emd_params.pop("constraint_type", None),
+                                dropout = tagger_ner_params.pop("dropout", None),
+                                regularizer = regularizer)
+        self._tagger_emd = tagger_emd
+        
+        
+        ##############
+        # Coref Stuffs
+        ##############
+        coref_params = params.pop("coref")
+        
+        # Encoder
+        encoder_coref_params = coref_params.pop("encoder")
+        encoder_coref = Seq2SeqEncoder.from_params(encoder_coref_params)
+        self._encoder_coref =  encoder_coref
+        
+        shortcut_text_field_embedder_coref = ShortcutConnectTextFieldEmbedder(base_text_field_embedder = self._text_field_embedder,
+                                                                            previous_encoders = [self._encoder_ner, self._encoder_emd])
+        self._shortcut_text_field_embedder_coref = shortcut_text_field_embedder_coref
+        
+        # Tagger: Coreference
+        tagger_coref_params = coref_params.pop("tagger")
+        eval_on_gold_mentions = tagger_coref_params.pop_bool("eval_on_gold_mentions", False)
+        init_params = tagger_coref_params.pop("initializer", None)
+        initializer = (InitializerApplicator.from_params(init_params)
+                if init_params is not None
+                else InitializerApplicator())
+        
+        tagger_coref = CoreferenceCustom(vocab = vocab,
+                                        text_field_embedder = self._shortcut_text_field_embedder_coref,
+                                        context_layer = self._encoder_coref,
+                                        mention_feedforward = FeedForward.from_params(tagger_coref_params.pop("mention_feedforward")),
+                                        antecedent_feedforward = FeedForward.from_params(tagger_coref_params.pop("antecedent_feedforward")),
+                                        feature_size = tagger_coref_params.pop_int("feature_size"),
+                                        max_span_width = tagger_coref_params.pop_int("max_span_width"),
+                                        spans_per_word = tagger_coref_params.pop_float("spans_per_word"),
+                                        max_antecedents = tagger_coref_params.pop_int("max_antecedents"),
+                                        lexical_dropout = tagger_coref_params.pop_float("lexical_dropout", 0.2),
+                                        initializer = initializer,
+                                        regularizer = regularizer,
+                                        eval_on_gold_mentions = eval_on_gold_mentions)
+        self._tagger_coref = tagger_coref								
+        if eval_on_gold_mentions:
+            self._tagger_coref._eval_on_gold_mentions = True
+            
+        logger.info("Multi-Task Learning Model has been instantiated.")
+        
+    @overrides    
+    def forward(self, 
+                tensor_batch,
+                for_training: bool = False,
+                task_name: str = "ner") -> Dict[str, torch.Tensor]:
+        # pylint: disable=arguments-differ
+        """
+        Special case for forward: for coreference, we can use gold mentions to predict the clusters
+        during evaluation (not during training).
+        """
+        
+        tagger = getattr(self, "_tagger_%s" % task_name)
+        
+        if task_name == "coref" and tagger._eval_on_gold_mentions:
+            if for_training: tagger._use_gold_mentions = False
+            else: tagger._use_gold_mentions = True 
+
+        return tagger.forward(**tensor_batch)
+            
+    @overrides 
+    def get_metrics(self,
+                    task_name: str,
+                    reset: bool = False,
+                    full: bool = False) -> Dict[str, float]:
+        
+        task_tagger = getattr(self, "_tagger_" + task_name)
+        if full and task_name == "coref":
+            return task_tagger.get_metrics(reset = reset, full = full)
+        else:
+            return task_tagger.get_metrics(reset)
+
+    @classmethod    
+    def from_params(cls,
+                    vocab: Vocabulary,
+                    params: Params,
+                    regularizer: RegularizerApplicator) -> "LayerNerEmdCoref":
+        return cls(vocab = vocab,
+                params = params,
+                regularizer = regularizer)
+                      
\ No newline at end of file
diff --git a/hmtl/models/layerNerEmdRelation.py b/hmtl/models/layerNerEmdRelation.py
new file mode 100644
index 0000000..ed96c27
--- /dev/null
+++ b/hmtl/models/layerNerEmdRelation.py
@@ -0,0 +1,155 @@
+# coding: utf-8
+               
+import os
+import sys
+import logging
+from typing import Dict
+from overrides import overrides
+
+import torch
+
+from allennlp.common import Params
+from allennlp.data import Vocabulary
+from allennlp.models.model import Model
+from allennlp.modules import Seq2SeqEncoder, TextFieldEmbedder
+from allennlp.nn import RegularizerApplicator
+from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
+from allennlp.models.crf_tagger import CrfTagger
+
+from hmtl.modules.text_field_embedders import ShortcutConnectTextFieldEmbedder
+from hmtl.models.relation_extraction import RelationExtractor
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+@Model.register("ner_emd_relation")
+class LayerNerEmdRelation(Model):
+    """
+    A class that implement three tasks of HMTL model: NER (CRF Tagger), EMD (CRF Tagger) and Relation Extraction.
+    
+    Parameters
+    ----------
+    vocab: ``allennlp.data.Vocabulary``, required.
+        The vocabulary fitted on the data.
+    params: ``allennlp.common.Params``, required
+        Configuration parameters for the multi-task model.
+    regularizer: ``allennlp.nn.RegularizerApplicator``, optional (default = None)
+        A reguralizer to apply to the model's layers.
+    """
+    def __init__(self,
+                vocab: Vocabulary,
+                params: Params,
+                regularizer: RegularizerApplicator = None):
+                
+        super(LayerNerEmdRelation, self).__init__(vocab = vocab, regularizer = regularizer)
+        
+        # Base text Field Embedder
+        text_field_embedder_params = params.pop("text_field_embedder")
+        text_field_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, 
+                                                                params=text_field_embedder_params)
+        self._text_field_embedder = text_field_embedder
+
+        
+        ############
+        # NER Stuffs
+        ############
+        ner_params = params.pop("ner")
+        
+        # Encoder
+        encoder_ner_params = ner_params.pop("encoder")
+        encoder_ner = Seq2SeqEncoder.from_params(encoder_ner_params)
+        self._encoder_ner =  encoder_ner
+        
+        # Tagger NER - CRF Tagger
+        tagger_ner_params = ner_params.pop("tagger")
+        tagger_ner = CrfTagger(vocab = vocab,
+                            text_field_embedder = self._text_field_embedder,
+                            encoder = self._encoder_ner,
+                            label_namespace = tagger_ner_params.pop("label_namespace", "labels"),
+                            constraint_type = tagger_ner_params.pop("constraint_type", None),
+                            dropout = tagger_ner_params.pop("dropout", None),
+                            regularizer = regularizer)
+        self._tagger_ner = tagger_ner
+        
+        
+        ############
+        # EMD Stuffs
+        ############
+        emd_params = params.pop("emd")
+        
+        # Encoder
+        encoder_emd_params = emd_params.pop("encoder")
+        encoder_emd = Seq2SeqEncoder.from_params(encoder_emd_params)
+        self._encoder_emd =  encoder_emd
+        
+        shortcut_text_field_embedder = ShortcutConnectTextFieldEmbedder(base_text_field_embedder = self._text_field_embedder,
+                                                                        previous_encoders = [self._encoder_ner])
+        self._shortcut_text_field_embedder = shortcut_text_field_embedder
+        
+        
+        # Tagger: EMD - CRF Tagger
+        tagger_emd_params = emd_params.pop("tagger")
+        tagger_emd = CrfTagger(vocab = vocab,
+                                text_field_embedder = self._shortcut_text_field_embedder,
+                                encoder = self._encoder_emd,
+                                label_namespace = tagger_emd_params.pop("label_namespace", "labels"),
+                                constraint_type = tagger_emd_params.pop("constraint_type", None),
+                                dropout = tagger_ner_params.pop("dropout", None),
+                                regularizer = regularizer)
+        self._tagger_emd = tagger_emd
+        
+        
+        ############################
+        # Relation Extraction Stuffs
+        ############################
+        relation_params = params.pop("relation")
+        
+        # Encoder
+        encoder_relation_params = relation_params.pop("encoder")
+        encoder_relation = Seq2SeqEncoder.from_params(encoder_relation_params)
+        self._encoder_relation =  encoder_relation
+        
+        shortcut_text_field_embedder_relation = ShortcutConnectTextFieldEmbedder(base_text_field_embedder = self._text_field_embedder,
+                                                                                previous_encoders = [self._encoder_ner, self._encoder_emd])
+        self._shortcut_text_field_embedder_relation = shortcut_text_field_embedder_relation
+        
+        # Tagger: Relation
+        tagger_relation_params = relation_params.pop("tagger")
+        tagger_relation = RelationExtractor(vocab = vocab,
+                                            text_field_embedder = self._shortcut_text_field_embedder_relation,
+                                            context_layer = self._encoder_relation,
+                                            d = tagger_relation_params.pop_int("d"),
+                                            l = tagger_relation_params.pop_int("l"),
+                                            n_classes = tagger_relation_params.pop("n_classes"),
+                                            activation = tagger_relation_params.pop("activation"))
+        self._tagger_relation = tagger_relation								
+
+        logger.info("Multi-Task Learning Model has been instantiated.")
+        
+    @overrides
+    def forward(self, 
+                tensor_batch,
+                for_training: bool = False,
+                task_name: str = "ner") -> Dict[str, torch.Tensor]:
+        # pylint: disable=arguments-differ
+        
+        tagger = getattr(self, "_tagger_%s" % task_name)
+        return tagger.forward(**tensor_batch)
+            
+    @overrides
+    def get_metrics(self,
+                    task_name: str,
+                    reset: bool = False,
+                    full: bool = False) -> Dict[str, float]:
+        
+        task_tagger = getattr(self, "_tagger_" + task_name)
+        return task_tagger.get_metrics(reset)
+
+    @classmethod    
+    def from_params(cls,
+                    vocab: Vocabulary,
+                    params: Params,
+                    regularizer: RegularizerApplicator) -> "LayerNerEmdRelation":
+        return cls(vocab = vocab,
+                params = params,
+                regularizer = regularizer)
\ No newline at end of file
diff --git a/hmtl/models/layerRelation.py b/hmtl/models/layerRelation.py
new file mode 100644
index 0000000..45b7892
--- /dev/null
+++ b/hmtl/models/layerRelation.py
@@ -0,0 +1,100 @@
+# coding: utf-8
+
+import os
+import sys
+import logging
+from typing import Dict
+from overrides import overrides
+
+import torch
+
+from allennlp.common import Params
+from allennlp.data import Vocabulary
+from allennlp.models.model import Model
+from allennlp.modules import Seq2SeqEncoder, TextFieldEmbedder
+from allennlp.nn import RegularizerApplicator
+from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
+
+from hmtl.models.relation_extraction import RelationExtractor
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+@Model.register("relation")
+class LayerRelation(Model):
+    """
+    A class that implement one task of HMTL model: Relation Extraction.
+    
+    Parameters
+    ----------
+    vocab: ``allennlp.data.Vocabulary``, required.
+        The vocabulary fitted on the data.
+    params: ``allennlp.common.Params``, required
+        Configuration parameters for the multi-task model.
+    regularizer: ``allennlp.nn.RegularizerApplicator``, optional (default = None)
+        A reguralizer to apply to the model's layers.
+    """
+    def __init__(self,
+                vocab: Vocabulary,
+                params: Params,
+                regularizer: RegularizerApplicator = None):
+                
+        super(LayerRelation, self).__init__(vocab = vocab, regularizer = regularizer)
+
+        # Base text Field Embedder
+        text_field_embedder_params = params.pop("text_field_embedder")
+        text_field_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, 
+                                                                params=text_field_embedder_params)
+        self._text_field_embedder = text_field_embedder
+        
+        ############################
+        # Relation Extraction Stuffs
+        ############################
+        relation_params = params.pop("relation")
+        
+        # Encoder
+        encoder_relation_params = relation_params.pop("encoder")
+        encoder_relation = Seq2SeqEncoder.from_params(encoder_relation_params)
+        self._encoder_relation =  encoder_relation
+        
+        # Tagger: Relation
+        tagger_relation_params = relation_params.pop("tagger")
+        tagger_relation = RelationExtractor(vocab = vocab,
+                                            text_field_embedder = self._text_field_embedder,
+                                            context_layer = self._encoder_relation,
+                                            d = tagger_relation_params.pop_int("d"),
+                                            l = tagger_relation_params.pop_int("l"),
+                                            n_classes = tagger_relation_params.pop("n_classes"),
+                                            activation = tagger_relation_params.pop("activation"))
+        self._tagger_relation = tagger_relation	
+
+        logger.info("Multi-Task Learning Model has been instantiated.")
+
+    @overrides		
+    def forward(self, 
+                tensor_batch,
+                for_training: bool = False,
+                task_name: str = "relation") -> Dict[str, torch.Tensor]:
+        # pylint: disable=arguments-differ
+        
+        tagger = getattr(self, "_tagger_%s" % task_name)
+        return tagger.forward(**tensor_batch)
+
+    @overrides		
+    def get_metrics(self,
+                    task_name: str = "relation",
+                    reset: bool = False,
+                    full: bool = False) -> Dict[str, float]:
+        
+        task_tagger = getattr(self, "_tagger_" + task_name)
+        return task_tagger.get_metrics(reset)
+
+    @classmethod    
+    def from_params(cls,
+                    vocab: Vocabulary,
+                    params: Params,
+                    regularizer: RegularizerApplicator) -> "layerRelation":
+        return cls(vocab = vocab,
+                params = params,
+                regularizer = regularizer)
+        
\ No newline at end of file
diff --git a/hmtl/models/relation_extraction.py b/hmtl/models/relation_extraction.py
new file mode 100644
index 0000000..33533b1
--- /dev/null
+++ b/hmtl/models/relation_extraction.py
@@ -0,0 +1,274 @@
+# coding: utf-8
+
+import logging
+import math
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Variable #from torch.nn.parameter import Parameter, Variable
+
+from overrides import overrides
+
+from allennlp.common import Params
+from allennlp.data import Vocabulary
+from allennlp.models.model import Model
+from allennlp.modules import Seq2SeqEncoder, TextFieldEmbedder
+from allennlp.modules.span_extractors import SelfAttentiveSpanExtractor, EndpointSpanExtractor
+from allennlp.nn import util
+
+from hmtl.training.metrics import RelationF1Measure
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+# Mapping specific to the dataset used in our setting (ACE2005)
+# Please adapt it if necessary
+rel_type_2_idx = {"ORG-AFF": 0,
+				"PHYS": 1,
+				"ART": 2,
+				"PER-SOC": 3,
+				"PART-WHOLE": 4,
+				"GEN-AFF": 5} 
+idx_2_rel_type = {value: key for key, value in rel_type_2_idx.items()}
+
+
+@Model.register("relation_extractor")
+class RelationExtractor(Model):
+	"""
+	A class containing the scoring model for relation extraction.
+	It is derived the model proposed by Bekoulis G. in 
+	"Joint entity recognition and relation extraction as a multi-head selection problem"
+	https://bekou.github.io/papers/eswa2018b/bekoulis_eswa_2018b.pdf
+	
+	Parameters
+	----------
+	vocab: ``allennlp.data.Vocabulary``, required.
+        The vocabulary fitted on the data.
+	text_field_embedder : ``TextFieldEmbedder``, required
+        Used to embed the ``text`` ``TextField`` we get as input to the model.
+    context_layer : ``Seq2SeqEncoder``, required
+        This layer incorporates contextual information for each word in the document.
+	d: ``int``, required
+		The (half) dimension of embedding given	by the encoder context_layer.
+	l: ``int``, required
+		The dimension of the relation extractor scorer embedding.
+	n_classes: ``int``, required
+		The number of different possible relation classes.
+	activation: ``str``, optional (default = "relu")
+		Non-linear activation function for the scorer. Can be either "relu" or "tanh".
+	label_namespace: ``str``, optional (default = "relation_ace_labels")
+		The namespace for the labels of the task of relation extraction.
+	"""
+	def __init__(self,
+				 vocab: Vocabulary,
+				 text_field_embedder: TextFieldEmbedder,
+				 context_layer: Seq2SeqEncoder,
+				 d: int,
+				 l: int,
+				 n_classes: int,
+				 activation: str = "relu",
+				 label_namespace: str = "relation_ace_labels") -> None:
+		super(RelationExtractor, self).__init__(vocab)
+		
+		
+		self._U = nn.Parameter(torch.Tensor(2*d, l))
+		self._W = nn.Parameter(torch.Tensor(2*d, l))
+		self._V = nn.Parameter(torch.Tensor(l, n_classes))
+		self._b = nn.Parameter(torch.Tensor(l))
+		
+		self.init_weights()
+		
+		self._n_classes = n_classes
+		self._activation = activation
+		
+		self._text_field_embedder = text_field_embedder
+		self._context_layer = context_layer
+		
+		self._label_namespace = label_namespace
+		
+		self._relation_metric = RelationF1Measure()
+		
+		self._loss_fn = nn.BCEWithLogitsLoss()
+		
+		
+	def init_weights(self) -> None:
+		"""
+		Initialization for the weights of the model.
+		"""
+		nn.init.kaiming_normal_(self._U)		
+		nn.init.kaiming_normal_(self._W)
+		nn.init.kaiming_normal_(self._V)
+
+		nn.init.normal_(self._b)
+		
+		
+	def multi_class_cross_entropy_loss(self,
+									scores,
+									labels,
+									mask):
+		"""
+		Compute the loss from
+		"""
+		#Compute the mask before computing the loss
+		#Transform the mask that is at the sentence level (#Size: n_batches x padded_document_length)
+		#to a suitable format for the relation labels level
+		padded_document_length = mask.size(1)
+		mask = mask.float() #Size: n_batches x padded_document_length
+		squared_mask = torch.stack([e.view(padded_document_length, 1)*e for e in mask], dim = 0)
+		squared_mask = squared_mask.unsqueeze(-1).repeat(1,1,1,self._n_classes) #Size: n_batches x padded_document_length x padded_document_length x n_classes
+		
+		
+		#The scores (and gold labels) are flattened before using
+		#the binary cross entropy loss.
+		# We thus transform 
+		flat_size = scores.size()
+		scores = scores*squared_mask #Size: n_batches x padded_document_length x padded_document_length x n_classes
+		scores_flat = scores.view(flat_size[0], flat_size[1], flat_size[2]*self._n_classes) #Size: n_batches x padded_document_length x (padded_document_length x n_classes)
+		labels = labels*squared_mask #Size: n_batches x padded_document_length x padded_document_length x n_classes
+		labels_flat = labels.view(flat_size[0], flat_size[1], flat_size[2]*self._n_classes) #Size: n_batches x padded_document_length x (padded_document_length x n_classes)
+			
+		loss = self._loss_fn(scores_flat, labels_flat)
+		
+		#Amplify the loss to actually see something...
+		return 100*loss
+		
+		
+	@overrides
+	def forward(self,
+				text: Dict[str, torch.LongTensor],
+				relations: torch.IntTensor = None) -> Dict[str, torch.Tensor]:
+		# pylint: disable=arguments-differ
+		"""
+		Forward pass of the model.
+		Compute the predictions and the loss (if labels are available).
+		
+		Parameters:
+		----------
+		text: Dict[str, torch.LongTensor]
+			The input sentences which have transformed into indexes (integers) according to a mapping token:str -> token:int
+		relations: torch.IntTensor
+			The gold relations to predict.
+		"""
+		
+		#Text field embedder map the token:int to their word embedding representation token:embedding (whatever these embeddings are).
+		text_embeddings = self._text_field_embedder(text) 
+		#Compute the mask from the text: 1 if there is actually a word in the corresponding sentence, 0 if it has been padded.
+		mask = util.get_text_field_mask(text) #Size: batch_size x padded_document_length
+		
+		
+		#Compute the contextualized representation from the word embeddings.
+		#Usually, _context_layer is a Seq2seq model such as LSTM
+		encoded_text = self._context_layer(text_embeddings, mask) #Size: batch_size x padded_document_length x lstm_output_size
+		
+		
+		###### Relation Scorer ##############
+		#Compute the relation scores
+		left = torch.matmul(encoded_text, self._U) #Size: batch_size x padded_document_length x l
+		right = torch.matmul(encoded_text, self._W) #Size: batch_size x padded_document_length x l
+		
+		left = left.permute(1,0,2)
+		left = left.unsqueeze(3)
+		right = right.permute(0,2,1)
+		right = right.unsqueeze(0)
+		
+		B = left + right
+		B = B.permute(1,0,3,2) #Size: batch_size x padded_document_length x padded_document_length x l
+		
+		outer_sum_bias = B + self._b #Size: batch_size x padded_document_length x padded_document_length x l
+		if self._activation == "relu":
+			activated_outer_sum_bias = F.relu(outer_sum_bias)
+		elif self._activation == "tanh":
+			activated_outer_sum_bias = F.tanh(outer_sum_bias)
+			
+		relation_scores = torch.matmul(activated_outer_sum_bias, self._V) #Size: batch_size x padded_document_length x padded_document_length x n_classes
+		#################################################################
+		
+		
+		batch_size, padded_document_length = mask.size()	
+
+		relation_sigmoid_scores = torch.sigmoid(relation_scores) # F.sigmoid(relation_scores) #Size: batch_size x padded_document_length x padded_document_length x n_classes
+		
+		#predicted_relations[l, i, j, k] == 1 iif we predict a relation k with ARG1==i, ARG2==j in the l-th sentence of the batch 
+		predicted_relations = torch.round(relation_sigmoid_scores) #Size: batch_size x padded_document_length x padded_document_length x n_classes
+		
+		output_dict = {
+			"relation_sigmoid_scores": relation_sigmoid_scores,
+			"predicted_relations": predicted_relations,
+			"mask": mask
+		}
+		
+
+		if relations is not None:
+			#Reformat the gold relations before computing the loss
+			#Size: batch_size x padded_document_length x padded_document_length x n_classes
+			#gold_relations[l, i, j, k] == 1 iif we predict a relation k with ARG1==i, ARG2==j in the l-th sentence of the batch 
+			gold_relations = torch.zeros(batch_size, padded_document_length, padded_document_length, self._n_classes) 
+			
+
+			for exple_idx, exple_tags in enumerate(relations): #going through the batch
+				#rel is a list of list containing the current sentence in the batch
+				#each sublist in rel is of size padded_document_length
+				#and encodes a relation in the sentence where the two non zeros elements
+				#indicate the two words arguments AND the relation type between these two words.
+				for rel in exple_tags: 
+					#relations have been padded, so for each sentence in the batch there are
+					#max_nb_of_relations_in_batch_for_one_sentence relations ie (number of sublist such as rel)
+					#The padded relations are simply list of size padded_document_length filled with 0.
+					if rel.sum().item()==0: continue 
+					
+					for idx in rel.nonzero():
+						label_srt = self.vocab.get_token_from_index(rel[idx].item(), self._label_namespace)
+						arg, rel_type = label_srt.split("_")
+						if arg == "ARG1": x = idx.data[0]
+						else: y = idx.data[0]
+							
+					gold_relations[exple_idx, x, y, rel_type_2_idx[rel_type]] = 1
+			
+			#GPU support
+			if text_embeddings.is_cuda: gold_relations = gold_relations.cuda()
+			
+			
+			#Compute the loss
+			output_dict["loss"] = self.multi_class_cross_entropy_loss(scores = relation_scores, labels = gold_relations, mask = mask)
+			
+			#Compute the metrics with the predictions.
+			self._relation_metric(predictions = predicted_relations, gold_labels = gold_relations, mask = mask)
+			
+		return output_dict
+
+
+	@overrides
+	def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, Any]:
+		"""
+		Decode the predictions
+		"""
+		decoded_predictions = []
+		
+		for instance_tags in output_dict["predicted_relations"]:
+			sentence_length = instance_tags.size(0)
+			decoded_relations = []
+			
+			for arg1, arg2, rel_type_idx in instance_tags.nonzero().data:
+				relation = ["*"]*sentence_length
+				rel_type = idx_2_rel_type[rel_type_idx]
+				relation[arg1] = "ARG1_" + rel_type
+				relation[arg2] = "ARG2_" + rel_type
+				decoded_relations.append(relation)
+				
+			decoded_predictions.append(decoded_relations)
+			
+		output_dict["decoded_predictions"] = decoded_predictions
+		
+		return output_dict
+
+
+	@overrides
+	def get_metrics(self, reset: bool = False) -> Dict[str, float]:		
+		"""
+		Compute the metrics for relation: precision, recall and f1.
+		A relation is considered correct if we can correctly predict the last word of ARG1, the last word of ARG2 and the relation type.
+		"""
+		metric_dict = self._relation_metric.get_metric(reset = reset)
+		return {x: y for x, y in metric_dict.items() if "overall" in x}
diff --git a/hmtl/modules/__init__.py b/hmtl/modules/__init__.py
new file mode 100644
index 0000000..a4d6118
--- /dev/null
+++ b/hmtl/modules/__init__.py
@@ -0,0 +1,4 @@
+# coding: utf-8
+
+from hmtl.modules import seq2seq_encoders
+from hmtl.modules import text_field_embedders
\ No newline at end of file
diff --git a/hmtl/modules/seq2seq_encoders/__init__.py b/hmtl/modules/seq2seq_encoders/__init__.py
new file mode 100644
index 0000000..7aef322
--- /dev/null
+++ b/hmtl/modules/seq2seq_encoders/__init__.py
@@ -0,0 +1,3 @@
+# coding: utf-8
+
+from hmtl.modules.seq2seq_encoders.stacked_gru import StackedGRU
\ No newline at end of file
diff --git a/hmtl/modules/seq2seq_encoders/stacked_gru.py b/hmtl/modules/seq2seq_encoders/stacked_gru.py
new file mode 100644
index 0000000..b029e33
--- /dev/null
+++ b/hmtl/modules/seq2seq_encoders/stacked_gru.py
@@ -0,0 +1,129 @@
+# coding: utf-8
+
+from typing import List
+
+from overrides import overrides
+import torch
+from torch.nn import Dropout, Linear
+from torch.nn import GRU
+
+from allennlp.nn.util import last_dim_softmax, weighted_sum
+from allennlp.modules.seq2seq_encoders.seq2seq_encoder import Seq2SeqEncoder
+from allennlp.common.params import Params
+
+
+@Seq2SeqEncoder.register("stacked_gru")
+class StackedGRU(Seq2SeqEncoder):
+    # pylint: disable=line-too-long
+    """
+    This class implements a multiple layer GRU (RNN).
+    The specificity of this implementation compared to the default one in allennlp
+    (``allennlp.modules.seq2seq_encoders.Seq2SeqEncoder``) is the ability to
+    specify differents hidden state size for each layer of the in the
+    multiple-stacked-layers-GRU.
+    Optionally, different dropouts can be individually specified for each layer of the encoder.
+
+    Parameters
+    ----------
+    input_dim : ``int``, required.
+        The size of the last dimension of the input tensor.
+    hidden_sizes : ``List[int]``, required.
+        The hidden state sizes of each layer of the stacked-GRU.
+    num_layers : ``int``, required.
+        The number of layers to stack in the encoder.
+    bidirectional : ``bool``, required
+        Wheter or not the layers should be bidirectional.
+    dropouts : ``List[float]``, optional (default = None).
+        The dropout probabilities applied to each layer. The length of this list should
+        be equal to the number of layers ``num_layers``.
+    """
+
+    def __init__(self,
+                 input_dim: int,
+                 hidden_sizes: List[int],
+                 num_layers: int,
+                 bidirectional: bool,
+                 dropouts: List[float] = None) -> None:
+        super(StackedGRU, self).__init__()
+
+        self._input_dim = input_dim
+        self._hidden_sizes = hidden_sizes
+        self._num_layers = num_layers
+        self._bidirectional = bidirectional
+        self._dropouts = [0.]*num_layers if dropouts is None else dropouts
+
+        if len(self._hidden_sizes) != self._num_layers:
+            raise ValueError(f"Number of layers ({self._num_layers}) must be equal to the length of hidden state size list ({len(self._hidden_sizes)})")
+        if len(self._dropouts) != self._num_layers:
+            raise ValueError(f"Number of layers ({self._num_layers}) must be equal to the legnth of drouput rates list ({len(self._dropouts)})")
+        
+        self._output_dim = hidden_sizes[-1]
+        if self._bidirectional:
+            self._output_dim *= 2
+
+        self._gru_layers: List[GRU] = [] 
+        for k in range(self._num_layers):
+            input_size = self._input_dim if k==0 else self._hidden_sizes[k-1]
+            if self._bidirectional and (k!=0): 
+                input_size *= 2
+
+            gru_layer = GRU(input_size = input_size,
+                            hidden_size = self._hidden_sizes[k],
+                            dropout = self._dropouts[k],
+                            num_layers = 1,
+                            bidirectional = self._bidirectional)
+            self.add_module(f"gru_{k}", gru_layer)				
+            self._gru_layers.append(gru_layer)
+
+
+    def get_input_dim(self):
+        return self._input_dim
+
+    def get_output_dim(self):
+        return self._output_dim
+
+    @overrides
+    def is_bidirectional(self):
+        return self._bidirectional
+
+    @overrides
+    def forward(self,  # pylint: disable=arguments-differ
+                inputs: torch.Tensor,
+                mask: torch.LongTensor = None) -> torch.FloatTensor:
+        """
+        Parameters
+        ----------
+        inputs : ``torch.FloatTensor``, required.
+            A tensor of shape (batch_size, timesteps, input_dim)
+        mask : ``torch.FloatTensor``, optional (default = None).
+            A tensor of shape (batch_size, timesteps).
+
+        Returns
+        -------
+        A tensor of shape (batch_size, timesteps, output_projection_dim),
+        where output_projection_dim = input_dim by default.
+        """
+        gru = self._gru_layers[0]
+        outputs, _ = gru(inputs)
+        
+        for k in range(1, self._num_layers):
+            gru = self._gru_layers[k]
+            next_outputs, _ = gru(outputs)
+            outputs = next_outputs
+  
+        return outputs
+
+    @classmethod
+    def from_params(cls, params: Params) -> 'StackedGRU':
+        input_dim = params.pop_int('input_dim')
+        hidden_sizes = params.pop('hidden_sizes')
+        dropouts = params.pop('dropouts', None)
+        num_layers = params.pop_int('num_layers')
+        bidirectional = params.pop_bool('bidirectional')
+        params.assert_empty(cls.__name__)
+
+        return cls(input_dim = input_dim,
+                   hidden_sizes = hidden_sizes,
+                   num_layers = num_layers,
+                   bidirectional = bidirectional,
+                   dropouts = dropouts)
\ No newline at end of file
diff --git a/hmtl/modules/text_field_embedders/__init__.py b/hmtl/modules/text_field_embedders/__init__.py
new file mode 100644
index 0000000..f12783b
--- /dev/null
+++ b/hmtl/modules/text_field_embedders/__init__.py
@@ -0,0 +1,3 @@
+# coding: utf-8
+
+from hmtl.modules.text_field_embedders.shortcut_connect_text_field_embedder import ShortcutConnectTextFieldEmbedder
\ No newline at end of file
diff --git a/hmtl/modules/text_field_embedders/shortcut_connect_text_field_embedder.py b/hmtl/modules/text_field_embedders/shortcut_connect_text_field_embedder.py
new file mode 100644
index 0000000..1be64f5
--- /dev/null
+++ b/hmtl/modules/text_field_embedders/shortcut_connect_text_field_embedder.py
@@ -0,0 +1,63 @@
+# coding: utf-8
+
+from typing import Dict, List
+
+import torch
+from overrides import overrides
+
+from allennlp.modules.text_field_embedders.text_field_embedder import TextFieldEmbedder
+from allennlp.modules.seq2seq_encoders.seq2seq_encoder import Seq2SeqEncoder
+import allennlp.nn.util as util
+
+
+@TextFieldEmbedder.register("shortcut_connect_text_field_embedder")
+class ShortcutConnectTextFieldEmbedder(TextFieldEmbedder):
+    """
+    This class implements a specific text field embedder that benefits from the outputs of
+    a list of ``allennlp.modules.seq2seq_encoders.seq2seq_encoder.Seq2SeqEncoder``.
+    The base_text_field_embedder
+    (an ``allennlp.modules.text_field_embedders.text_field_embedder.TextFieldEmbedder``)
+    computes the word representation. The previous_encoders are then applied one after the
+    other and, after each of them, the word representation is concatenated (on the last
+    dimension) with the output of the encoder. This explains the name of this class
+    "ShortcutConnectTextFieldEmbedder": the result carries both the output of the last
+    previous encoder and the output of the base_text_field_embedder,
+    the connection with base_text_field_embedder actually circumventing the previous encoders.
+
+    Parameters
+    ----------
+    base_text_field_embedder : ``TextFieldEmbedder``, required
+        The text field embedder that computes the word representation at the base of the model.
+    previous_encoders : ``List[Seq2SeqEncoder]``, required
+        The previous seq2seq encoders, in the order in which they are applied. May be empty.
+    """
+    def __init__(self,
+                base_text_field_embedder: TextFieldEmbedder,
+                previous_encoders: List[Seq2SeqEncoder]) -> None:
+        super(ShortcutConnectTextFieldEmbedder, self).__init__()
+        self._base_text_field_embedder = base_text_field_embedder
+        # NOTE(review): a plain list does not register the encoders as sub-modules of this
+        # embedder - presumably they are registered by their owner elsewhere; confirm.
+        self._previous_encoders = previous_encoders
+
+    @overrides
+    def get_output_dim(self) -> int:
+        # Mirrors ``forward``: base embedding, plus the last encoder output when there is one.
+        output_dim = self._base_text_field_embedder.get_output_dim()
+        if self._previous_encoders:
+            output_dim += self._previous_encoders[-1].get_output_dim()
+        return output_dim
+
+    @overrides
+    def forward(self,
+                text_field_input: Dict[str, torch.Tensor],
+                num_wrapping_dims: int = 0) -> torch.Tensor:
+        base_representation = self._base_text_field_embedder.forward(text_field_input, num_wrapping_dims)
+        mask = util.get_text_field_mask(text_field_input)
+        text_field_embeddings = base_representation
+        for encoder in self._previous_encoders:
+            # Encode, then put the base representation back next to the encoder output.
+            text_field_embeddings = encoder(text_field_embeddings, mask)
+            text_field_embeddings = torch.cat([base_representation, text_field_embeddings], dim = -1)
+
+        return text_field_embeddings
\ No newline at end of file
diff --git a/hmtl/tasks/__init__.py b/hmtl/tasks/__init__.py
new file mode 100644
index 0000000..ad064fd
--- /dev/null
+++ b/hmtl/tasks/__init__.py
@@ -0,0 +1,3 @@
+# coding: utf-8
+
+from hmtl.tasks.task import Task
\ No newline at end of file
diff --git a/hmtl/tasks/task.py b/hmtl/tasks/task.py
new file mode 100644
index 0000000..c04f206
--- /dev/null
+++ b/hmtl/tasks/task.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+
+from typing import List
+from allennlp.common import Params
+from allennlp.commands.train import datasets_from_params
+from allennlp.data.iterators import DataIterator
+from allennlp.common.checks import ConfigurationError
+
+class Task():
+    """
+    A class to encapsulate the necessary information (and datasets)
+    about each task.
+
+    Parameters
+    ----------
+    name : ``str``, required
+        The name of the task.
+    validation_metric_name : ``str``, required
+        The name of the validation metric to use to monitor training
+        and select the best epoch.
+    validation_metric_decreases : ``bool``, required
+        Whether or not the validation metric should decrease for improvement.
+    evaluate_on_test : ``bool``, optional (default = False)
+        Whether or not the task should be evaluated on the test set at the end of the training.
+    """
+    def __init__(self,
+                name: str,
+                validation_metric_name: str,
+                validation_metric_decreases: bool,
+                evaluate_on_test: bool = False) -> None:
+        self._name = name
+        # The datasets stay ``None`` until ``load_data_from_params`` finds the matching split.
+        self._train_data = None
+        self._validation_data = None
+        self._test_data = None
+        self._evaluate_on_test = evaluate_on_test
+
+        self._val_metric = validation_metric_name
+        self._val_metric_decreases = validation_metric_decreases
+
+        self._data_iterator = None
+
+    def set_data_iterator(self,
+                    data_iterator: DataIterator):
+        """Attach the iterator used for this task; ``None`` is rejected with a ``ConfigurationError``."""
+        if data_iterator is None:
+            # Fail loudly: silently keeping the previous iterator would hide a configuration mistake.
+            raise ConfigurationError(f"data_iterator cannot be None in set_data_iterator - Task name: {self._name}")
+        self._data_iterator = data_iterator
+
+    def load_data_from_params(self,
+                            params: Params):
+        """Load the datasets described by ``params``, keep the splits, return the instances and dataset names used for the vocabulary."""
+        all_datasets = datasets_from_params(params)
+        datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))
+
+        for dataset in datasets_for_vocab_creation:
+            if dataset not in all_datasets:
+                raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")
+
+        # A generator: the instances can be walked through only once.
+        instances_for_vocab_creation = (instance for key, dataset in all_datasets.items()
+                                        for instance in dataset
+                                        if key in datasets_for_vocab_creation)
+        self._instances_for_vocab_creation = instances_for_vocab_creation
+        self._datasets_for_vocab_creation = datasets_for_vocab_creation
+
+        # Counting walks through each dataset once, which is horrible if lazy iterator (Iterable).
+        if 'train' in all_datasets:
+            self._train_data = all_datasets["train"]
+            self._tr_instances = sum(1 for e in self._train_data)
+        if 'validation' in all_datasets:
+            self._validation_data = all_datasets["validation"]
+            self._val_instances = sum(1 for e in self._validation_data)
+        if 'test' in all_datasets:
+            self._test_data = all_datasets["test"]
+            self._test_instances = sum(1 for e in self._test_data)
+
+        # Evaluating on the test set requires it to be loaded (a raise, unlike an assert, survives -O).
+        if self._evaluate_on_test and self._test_data is None:
+            raise ConfigurationError(f"evaluate_on_test is set but there is no test dataset - Task name: {self._name}")
+
+        return instances_for_vocab_creation, datasets_for_vocab_creation
+
+    @classmethod
+    def from_params(cls, params: Params) -> "Task":
+        """Build a ``Task`` from ``params``; a missing key falls back to the default given below."""
+        task_name = params.pop("task_name", "ner")
+        validation_metric_name = params.pop("validation_metric_name", "f1-measure-overall")
+        validation_metric_decreases = params.pop_bool("validation_metric_decreases", False)
+        evaluate_on_test = params.pop_bool("evaluate_on_test", False)
+        params.assert_empty(cls.__name__)
+        return cls(name = task_name,
+                validation_metric_name = validation_metric_name,
+                validation_metric_decreases = validation_metric_decreases,
+                evaluate_on_test = evaluate_on_test)
\ No newline at end of file
diff --git a/hmtl/training/__init__.py b/hmtl/training/__init__.py
new file mode 100644
index 0000000..f7eecc3
--- /dev/null
+++ b/hmtl/training/__init__.py
@@ -0,0 +1,3 @@
+# coding: utf-8
+
+from hmtl.training.sampler_multi_task_trainer import SamplerMultiTaskTrainer
\ No newline at end of file
diff --git a/hmtl/training/metrics/__init__.py b/hmtl/training/metrics/__init__.py
new file mode 100644
index 0000000..d0094ad
--- /dev/null
+++ b/hmtl/training/metrics/__init__.py
@@ -0,0 +1,4 @@
+# coding: utf-8
+
+from hmtl.training.metrics.relation_f1_measure import RelationF1Measure
+from hmtl.training.metrics.conll_coref_full_scores import ConllCorefFullScores
\ No newline at end of file
diff --git a/hmtl/training/metrics/conll_coref_full_scores.py b/hmtl/training/metrics/conll_coref_full_scores.py
new file mode 100644
index 0000000..76bba7a
--- /dev/null
+++ b/hmtl/training/metrics/conll_coref_full_scores.py
@@ -0,0 +1,35 @@
+from overrides import overrides
+
+from allennlp.training.metrics import ConllCorefScores
+
+class ConllCorefFullScores(ConllCorefScores):
+    """
+    A marginal modification of the class ``allennlp.training.metrics.metric.ConllCorefScores``.
+    It leaves the possibility to get the 3 detailed coreference metrics (B3, MUC, CEAFE),
+    and not only their average.
+    """
+    def __init__(self) -> None:
+        super(ConllCorefFullScores, self).__init__()
+
+    @overrides
+    def get_metric(self, reset: bool = False, full: bool = False):
+        """
+        Return the scores averaged over the scorers and, when ``full`` is set, the scores of each scorer too.
+        """
+        report = {}
+        if full:
+            for scorer in self.scorers:
+                report[scorer.metric.__name__] = {"precision": scorer.get_precision(),
+                                                  "recall": scorer.get_recall(),
+                                                  "f1_score": scorer.get_f1()}
+
+        # The averages over all the scorers are always reported.
+        n_scorers = len(self.scorers)
+        report["coref_precision"] = sum(scorer.get_precision() for scorer in self.scorers) / n_scorers
+        report["coref_recall"] = sum(scorer.get_recall() for scorer in self.scorers) / n_scorers
+        report["coref_f1"] = sum(scorer.get_f1() for scorer in self.scorers) / n_scorers
+
+        if reset:
+            self.reset()
+
+        return report
\ No newline at end of file
diff --git a/hmtl/training/metrics/relation_f1_measure.py b/hmtl/training/metrics/relation_f1_measure.py
new file mode 100644
index 0000000..e3fd299
--- /dev/null
+++ b/hmtl/training/metrics/relation_f1_measure.py
@@ -0,0 +1,109 @@
+from typing import Dict, List, Optional, Set
+from collections import defaultdict
+
+import torch
+
+from allennlp.common.checks import ConfigurationError
+from allennlp.nn.util import get_lengths_from_binary_sequence_mask #, ones_like
+from allennlp.data.vocabulary import Vocabulary
+from allennlp.training.metrics.metric import Metric
+
+@Metric.register("relation_f1")
+class RelationF1Measure(Metric):
+    """
+    A class for computing the metrics specific to relation extraction.
+
+    Each non-zero entry of the (masked) predictions is one predicted relation, identified by
+    its two positions in the document and its relation type. It is a true positive when the
+    same entry is non-zero in the gold labels and a false positive otherwise; a non-zero gold
+    entry that is not predicted is a false negative.
+    """
+    def __init__(self) -> None:
+        self._true_positives: int = 0
+        self._false_positives: int = 0
+        self._false_negatives: int = 0
+
+    def __call__(self,
+                 predictions: torch.Tensor,
+                 gold_labels: torch.Tensor,
+                 mask: Optional[torch.Tensor] = None):
+        """
+        Update the TP, FP and FN counters.
+
+        Parameters
+        ----------
+        predictions : ``torch.Tensor``, required.
+            A tensor of predictions of shape
+            (batch_size, sequence_length, sequence_length, num_classes).
+        gold_labels : ``torch.Tensor``, required.
+            A tensor of gold relations. It must have the same shape as the ``predictions`` tensor.
+        mask: ``torch.Tensor``, optional (default = None).
+            A masking tensor of shape (batch_size, sequence_length). A pair of positions is
+            taken into account only if both positions are unmasked. If None, nothing is masked.
+
+        Raises
+        ------
+        ConfigurationError
+            If ``predictions`` and ``gold_labels`` don't have the same size.
+        """
+        if mask is None:
+            # The mask is at the sentence level: one value per (batch, position).
+            mask = gold_labels.new_ones(gold_labels.size()[:2])
+        # Get the data from the Variables.
+        predictions, gold_labels, mask = self.unwrap_to_tensors(predictions,
+                                                                gold_labels,
+                                                                mask)
+
+        if gold_labels.size() != predictions.size():
+            raise ConfigurationError("Predictions and gold labels don't have the same size.")
+
+        # Unpacking doubles as a check that the tensors have exactly 4 dimensions.
+        batch_size, _, _, _ = predictions.size()
+
+        # Transform the mask that is at the sentence level (Size: n_batches x padded_document_length)
+        # to a suitable format for the relation labels level: the outer product of the mask with
+        # itself. The trailing dimension of size 1 broadcasts over the classes.
+        mask = mask.float()
+        squared_mask = (mask.unsqueeze(2) * mask.unsqueeze(1)).unsqueeze(-1) #Size: n_batches x padded_document_length x padded_document_length x 1
+
+        predictions = predictions.float()*squared_mask #Size: n_batches x padded_document_length x padded_document_length x n_classes
+        gold_labels = gold_labels.float()*squared_mask #Size: n_batches x padded_document_length x padded_document_length x n_classes
+
+        for i in range(batch_size):
+            # A relation is identified by its index in the flattened (position, position, class) tensor.
+            predicted_relations = set(predictions[i].view(-1).nonzero().view(-1).tolist())
+            gold_relations = set(gold_labels[i].view(-1).nonzero().view(-1).tolist())
+
+            # Set arithmetic yields the counts without comparing every prediction with every gold index.
+            n_correct = len(predicted_relations & gold_relations)
+            self._true_positives += n_correct
+            self._false_positives += len(predicted_relations) - n_correct
+            self._false_negatives += len(gold_relations) - n_correct
+
+    def get_metric(self, reset: bool = False):
+        """
+        Get the metrics and reset the counters if necessary.
+        """
+        # Compute the precision, recall and f1 for all relations jointly.
+        precision, recall, f1_measure = self._compute_metrics(self._true_positives,
+                                                              self._false_positives,
+                                                              self._false_negatives)
+        all_metrics = {"precision-overall": precision,
+                       "recall-overall": recall,
+                       "f1-measure-overall": f1_measure}
+        if reset:
+            self.reset()
+        return all_metrics
+
+    @staticmethod
+    def _compute_metrics(true_positives: int, false_positives: int, false_negatives: int):
+        # The small constant keeps the divisions defined when a denominator would be zero.
+        precision = float(true_positives) / float(true_positives + false_positives + 1e-13)
+        recall = float(true_positives) / float(true_positives + false_negatives + 1e-13)
+        f1_measure = 2. * ((precision * recall) / (precision + recall + 1e-13))
+        return precision, recall, f1_measure
+
+    def reset(self):
+        self._true_positives = 0
+        self._false_positives = 0
+        self._false_negatives = 0
diff --git a/hmtl/training/multi_task_trainer.py b/hmtl/training/multi_task_trainer.py
new file mode 100644
index 0000000..f9b345e
--- /dev/null
+++ b/hmtl/training/multi_task_trainer.py
@@ -0,0 +1,380 @@
+# coding: utf-8
+
+import os
+import math
+import time
+from copy import deepcopy
+import random
+import logging
+import itertools
+import shutil
+from tensorboardX import SummaryWriter
+
+from typing import List, Optional, Dict, Any, Tuple
+
+import torch
+import torch.optim.lr_scheduler
+import tqdm
+
+from allennlp.common import Params
+from allennlp.common.checks import ConfigurationError, check_for_gpu
+from allennlp.common.util import peak_memory_mb, gpu_memory_mb
+from allennlp.nn.util import device_mapping, move_to_device
+from allennlp.training.learning_rate_schedulers import LearningRateScheduler
+from allennlp.training.optimizers import Optimizer
+from allennlp.training.trainer import sparse_clip_norm, TensorboardWriter
+from allennlp.models.model import Model
+from allennlp.common.registrable import Registrable
+
+
+from hmtl.tasks import Task
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+class MultiTaskTrainer(Registrable):
+    def __init__(self,
+                model: Model,
+                task_list: List[Task],
+                optimizer_params: Params,
+                lr_scheduler_params: Params,
+                patience: Optional[int] = None,
+                num_epochs: int = 20,
+                serialization_dir: str = None,
+                cuda_device: int = -1,
+                grad_norm: Optional[float] = None,
+                grad_clipping: Optional[float] = None,
+                min_lr: float = 0.00001,
+                no_tqdm: bool = False,
+                summary_interval: int = 50,
+                log_parameter_statistics: bool = False,
+                log_gradient_statistics: bool = False):
+        """
+        Parameters
+        ----------
+        model: ``Model``, required.
+            An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
+            their ``forward`` method returns a dictionary with a "loss" key, containing a
+            scalar tensor representing the loss function to be optimized.
+        task_list: ``List[Task]``, required.
+            The tasks to train on. One optimizer and one learning rate scheduler are created per task.
+        optimizer_params, lr_scheduler_params: ``Params``, required.
+            The parameters used to build the optimizer / scheduler of each task (copied for every task).
+        patience: Optional[int] > 0, optional (default=None)
+            Number of epochs to be patient before early stopping: the training is stopped
+            after ``patience`` epochs with no improvement. If given, it must be ``> 0``.
+            If None, early stopping is disabled.
+        num_epochs: int, optional (default = 20)
+            Number of training epochs.
+        serialization_dir: str, optional (default=None)
+            Path to directory for saving and loading model files. If it is not passed,
+            no tensorboard log is written.
+        cuda_device: int, optional (default = -1)
+            An integer specifying the CUDA device to use. If -1, the CPU is used.
+        grad_norm: float, optional, (default = None).
+            If provided, gradient norms will be rescaled to have a maximum of this value.
+        grad_clipping : float, optional (default = None).
+            If provided, gradients will be clipped `during the backward pass` to have an (absolute) maximum of this value.
+        no_tqdm : bool, optional (default=False)
+            If ``True``, we will not use tqdm, and instead log batch statistics using ``logger.info``.
+        summary_interval: int, optional (default = 50)
+            Number of batches between logging to tensorboard.
+        """
+        self._model = model
+        self._cuda_device = cuda_device
+        if self._cuda_device >= 0:
+            # Move the model first: PyTorch asks for this to happen before the optimizers are built.
+            check_for_gpu(self._cuda_device)
+            self._model = self._model.cuda(self._cuda_device)
+        parameters_to_train = [(n, p) for n, p in self._model.named_parameters() if p.requires_grad]
+
+        self._task_list = task_list
+        self._n_tasks = len(self._task_list)
+
+        # One optimizer and one scheduler per task, all built over the same trainable parameters.
+        self._optimizer_params = optimizer_params
+        self._optimizers = {}
+        self._lr_scheduler_params = lr_scheduler_params
+        self._schedulers = {}
+        for task in self._task_list:
+            task_name = task._name
+            self._optimizers[task_name] = Optimizer.from_params(model_parameters = parameters_to_train,
+                                                                  params = deepcopy(optimizer_params))
+            self._schedulers[task_name] = LearningRateScheduler.from_params(optimizer = self._optimizers[task_name],
+                                                                            params = deepcopy(lr_scheduler_params))
+
+        self._serialization_dir = serialization_dir
+        self._patience = patience
+        self._num_epochs = num_epochs
+        self._grad_norm = grad_norm
+        self._grad_clipping = grad_clipping
+        self._min_lr = min_lr
+
+        self._task_infos = None
+        self._metric_infos = None
+        self._tr_generators = None
+        self._no_tqdm = no_tqdm
+
+        self._summary_interval = summary_interval  # num batches between logging to tensorboard
+        self._log_parameter_statistics = log_parameter_statistics
+        self._log_gradient_statistics = log_gradient_statistics
+        self._global_step = 0
+        if self._serialization_dir is not None:
+            train_log = SummaryWriter(os.path.join(self._serialization_dir, "log", "train"))
+            validation_log = SummaryWriter(os.path.join(self._serialization_dir, "log", "validation"))
+            self._tensorboard = TensorboardWriter(train_log = train_log, validation_log = validation_log)
+        else:
+            # No directory to write to: the writer is created without any log.
+            self._tensorboard = TensorboardWriter()
+
+
+    def train(self,
+            # No task list or params argument here: the tasks are handed to the constructor.
+            # NOTE(review): ``recover`` presumably asks to resume from a checkpoint - confirm in sub-classes.
+            recover: bool = False):
+        """Not implemented in this class: it always raises ``NotImplementedError``."""
+        raise NotImplementedError
+    
+    
+    def _check_history(self,
+                    metric_history: List[float],
+                    cur_score: float,
+                    should_decrease: bool = False):
+        '''
+        Given the history of the performance on a task
+        and the current score, check if current score is
+        best so far and if out of patience.
+
+        Parameters
+        ----------
+        metric_history: List[float], required
+            The validation scores so far; it must not be empty and is expected to end with ``cur_score``.
+        cur_score: float, required
+        should_decrease: bool, default = False
+            Whether or not the validation metric should decrease while training.
+            For instance, the bigger the f1 score is, the better it is -> should_decrease = False
+
+        Returns
+        -------
+        best_so_far: bool
+            Whether or not the current epoch is the best so far in terms of the specified validation metric.
+        out_of_patience: bool
+            Whether or not the training for this specific task should stop (patience parameter).
+            Always False when ``patience`` is None, i.e. when early stopping is disabled.
+        '''
+        best_fn = min if should_decrease else max
+        best_score = best_fn(metric_history)
+        if best_score == cur_score:
+            best_so_far = metric_history.index(best_score) == len(metric_history) - 1
+        else:
+            best_so_far = False
+        out_of_patience = False
+        if self._patience is not None and len(metric_history) > self._patience + 1:
+            recent_scores = metric_history[-(self._patience + 1):]
+            if should_decrease:
+                out_of_patience = max(recent_scores) <= cur_score
+            else:
+                out_of_patience = min(recent_scores) >= cur_score
+
+        if best_so_far and out_of_patience: # then something is up
+            print("Something is up")
+        return best_so_far, out_of_patience
+    
+    
+    def _forward(self,
+                tensor_batch: torch.Tensor,
+                for_training: bool = False,
+                task:Task = None):
+        """Run the model on ``tensor_batch`` for ``task``; when training, the regularization penalty is added to the loss."""
+        if task is None:
+            raise ConfigurationError("Cannot call forward through task `None`")
+        tensor_batch = move_to_device(tensor_batch, self._cuda_device)
+        output_dict = self._model.forward(task_name = task._name, tensor_batch = tensor_batch, for_training = for_training)
+        if for_training:
+            try:
+                loss = output_dict["loss"]
+                loss += self._model.get_regularization_penalty()
+            except KeyError:
+                raise RuntimeError("The model you are trying to optimize does not contain a"
+                                       " `loss` key in the output of model.forward(inputs).")
+        return output_dict
+    
+        
+    def _get_metrics(self,
+                    task: Task,
+                    reset: bool = False):
+        # The tagger of a task is looked up on the model under the attribute ``_tagger_<task name>``.
+        return getattr(self._model, "_tagger_" + task._name).get_metrics(reset)
+
+
+    def _description_from_metrics(self,
+                                 metrics: Dict[str, float]):
+        # pylint: disable=no-self-use
+        return ', '.join("%s: %.4f" % item for item in metrics.items()) + " ||"
+
+
+    def _rescale_gradients(self) -> Optional[float]:
+        """
+        Performs gradient rescaling and returns what ``sparse_clip_norm`` returns; no-op (None) if not enabled.
+        """
+        if not self._grad_norm:
+            return None
+        # Only the parameters that currently hold a gradient are handed to the clipping function.
+        with_gradient = [p for p in self._model.parameters() if p.grad is not None]
+        return sparse_clip_norm(with_gradient, self._grad_norm)
+
+
+    def _enable_gradient_clipping(self) -> None:
+        """Register, on every trainable parameter, a hook clamping its gradient to ``[-grad_clipping, grad_clipping]``."""
+        if self._grad_clipping is None:
+            return
+        # ``self._grad_clipping`` is read each time the hook is called, not when it is registered.
+        clip_function = lambda grad: grad.clamp(-self._grad_clipping, self._grad_clipping)  # pylint: disable=invalid-unary-operand-type
+        for parameter in (p for p in self._model.parameters() if p.requires_grad):
+            parameter.register_hook(clip_function)
+    
+                    
+    def _save_checkpoint(self,
+                        epoch: int,
+                        should_stop: bool) -> None:
+        """
+        Save the current states (model, training, optimizers, metrics and tasks).
+        Nothing is saved when no ``serialization_dir`` was given.
+
+        Parameters
+        ----------
+        epoch: int, required.
+            The epoch of training.
+        should_stop: bool, required
+            Whether or not the training is finished.
+        """
+        if self._serialization_dir is None:
+            return
+
+        ### Saving training state ###
+        training_state = {"epoch": epoch,
+                        "should_stop": should_stop,
+                        "metric_infos": self._metric_infos,
+                        "task_infos": self._task_infos,
+                        "schedulers": {},
+                        "optimizers": {}}
+
+        if self._optimizers is not None:
+            for task_name, optimizer in self._optimizers.items():
+                training_state["optimizers"][task_name] = optimizer.state_dict()
+        if self._schedulers is not None:
+            for task_name, scheduler in self._schedulers.items():
+                training_state["schedulers"][task_name] = scheduler.lr_scheduler.state_dict()
+
+        training_path = os.path.join(self._serialization_dir, "training_state.th")
+        torch.save(training_state, training_path)
+        logger.info("Checkpoint - Saved training state to %s", training_path)
+
+        ### Saving model state ###
+        model_path = os.path.join(self._serialization_dir, "model_state.th")
+        torch.save(self._model.state_dict(), model_path)
+        logger.info("Checkpoint - Saved model state to %s", model_path)
+
+        ### Saving best models for each task ###
+        for task_name, infos in self._metric_infos.items():
+            best_epoch, _ = infos["best"]
+            if best_epoch == epoch:
+                # The model saved just above is the best one for this task: keep a copy of it.
+                logger.info("Checkpoint - Best validation performance so far for %s task", task_name)
+                logger.info("Checkpoint - Copying weights to '%s/best_%s.th'.", self._serialization_dir, task_name)
+                shutil.copyfile(model_path, os.path.join(self._serialization_dir, "best_{}.th".format(task_name)))
+    
+    
+    def find_latest_checkpoint(self) -> Optional[Tuple[str, str]]:
+        """
+        Return the location of the latest model and training state files.
+        Both ``model_state.th`` and ``training_state.th`` must exist in ``serialization_dir``.
+        If there isn't a valid checkpoint then return None.
+        """
+        if self._serialization_dir is None:
+            return None
+
+        model_path = os.path.join(self._serialization_dir, "model_state.th")
+        training_state_path = os.path.join(self._serialization_dir, "training_state.th")
+
+        # Check the exact files that are going to be loaded: a substring match on the directory
+        # listing would also accept unrelated names such as ``model_state.th.bak``.
+        if not (os.path.isfile(model_path) and os.path.isfile(training_state_path)):
+            return None
+
+        return (model_path, training_state_path)
+        
+                
+    def _restore_checkpoint(self):
+        """
+        Restores a model from a serialization_dir to the last saved checkpoint.
+        This includes an epoch count, optimizer state, a model state, a task state and
+        a metric state. The model state and the training state are serialized separately.
+        This function should only be used to continue training -
+        if you wish to load a model for inference/load parts of a model into a new
+        computation graph, you should use the native Pytorch functions:
+        `` model.load_state_dict(torch.load("/path/to/model/weights.th"))``
+
+        Returns
+        -------
+        epoch: int,
+            The epoch at which to resume training.
+        should_stop: bool
+            Whether or not the training should already be stopped.
+        """
+        if not self._serialization_dir:
+            raise ConfigurationError("`serialization_dir` not specified - cannot "
+                                     "restore a model without a directory path.")
+
+        latest_checkpoint = self.find_latest_checkpoint()
+        if latest_checkpoint is None:
+            raise ConfigurationError("Cannot restore model because one of "
+                                    "`model_state.th` or `training_state.th` is not in directory path.")
+
+        model_path, training_state_path = latest_checkpoint
+
+        # Load the parameters onto CPU, then transfer to GPU.
+        # This avoids potential OOM on GPU for large models that
+        # load parameters onto GPU then make a new GPU copy into the parameter
+        # buffer. The GPU transfer happens implicitly in load_state_dict.
+        model_state = torch.load(model_path, map_location = device_mapping(-1))
+        training_state = torch.load(training_state_path, map_location = device_mapping(-1))
+
+        # Load model
+        self._model.load_state_dict(model_state)
+        logger.info("Checkpoint - Model loaded from %s", model_path)
+
+        # Load optimizers
+        for task_name, optimizers_state in training_state["optimizers"].items():
+            self._optimizers[task_name].load_state_dict(optimizers_state)
+        logger.info("Checkpoint - Optimizers loaded from %s", training_state_path)
+
+        # Load schedulers
+        for task_name, scheduler_state in training_state["schedulers"].items():
+            self._schedulers[task_name].lr_scheduler.load_state_dict(scheduler_state)
+        logger.info("Checkpoint - Learning rate schedulers loaded from %s", training_state_path)
+
+        self._metric_infos = training_state["metric_infos"]
+        self._task_infos = training_state["task_infos"]
+        logger.info("Checkpoint - Task infos loaded from %s", training_state_path)
+        logger.info("Checkpoint - Metric infos loaded from %s", training_state_path)
+
+        # Training resumes at the epoch that follows the saved one.
+        n_epoch, should_stop = training_state["epoch"], training_state["should_stop"]
+
+        return n_epoch + 1, should_stop
+
+
+    @classmethod
+    def from_params(cls,
+                    model: Model,
+                    task_list: List[Task],
+                    serialization_dir: str,
+                    params: Params) -> 'MultiTaskTrainer':
+        """Class method that constructs the multi task trainer registered under the ``type`` key of ``params``."""
+        # ``type`` is popped here; what is left of ``params`` is handed to the selected trainer.
+        choice = params.pop_choice('type', cls.list_available())
+        trainer_class = cls.by_name(choice)
+        return trainer_class.from_params(model = model,
+                                         task_list = task_list,
+                                         serialization_dir = serialization_dir,
+                                         params = params)
\ No newline at end of file
diff --git a/hmtl/training/sampler_multi_task_trainer.py b/hmtl/training/sampler_multi_task_trainer.py
new file mode 100644
index 0000000..bfc12de
--- /dev/null
+++ b/hmtl/training/sampler_multi_task_trainer.py
@@ -0,0 +1,501 @@
+# coding: utf-8
+
+import os
+import math
+import time
+from copy import deepcopy
+import random
+import logging
+import itertools
+import shutil
+from tensorboardX import SummaryWriter
+import numpy as np
+
+from typing import List, Optional, Dict, Any
+from overrides import overrides
+
+import torch
+import torch.optim.lr_scheduler
+import tqdm
+
+from allennlp.common import Params
+from allennlp.common.checks import ConfigurationError, check_for_gpu
+from allennlp.common.util import peak_memory_mb, gpu_memory_mb
+from allennlp.nn.util import device_mapping
+from allennlp.data.iterators import DataIterator
+from allennlp.training.learning_rate_schedulers import LearningRateScheduler
+from allennlp.training.optimizers import Optimizer
+from allennlp.training.trainer import sparse_clip_norm, TensorboardWriter
+from allennlp.models.model import Model
+
+from hmtl.tasks import Task
+from hmtl.training.multi_task_trainer import MultiTaskTrainer
+
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+    
+@MultiTaskTrainer.register("sampler_multi_task_trainer")
+class SamplerMultiTaskTrainer(MultiTaskTrainer):
+    def __init__(self, 
+                model: Model,
+                task_list: List[Task],
+                optimizer_params: Params,
+                lr_scheduler_params: Params,
+                patience: Optional[int] = None, 
+                num_epochs: int = 20, 
+                serialization_dir: str = None, 
+                cuda_device: int = -1,
+                grad_norm: Optional[float] = None, 
+                grad_clipping: Optional[float] = None, 
+                min_lr: float = 0.00001,
+                no_tqdm: bool = False,
+                summary_interval: int = 50,
+                log_parameter_statistics: bool = False,
+                log_gradient_statistics: bool = False,
+                sampling_method: str = "proportional"):
+                
+        if sampling_method not in ["uniform", "proportional"]:
+            raise ConfigurationError(f"Sampling method ({sampling_method}) must be `uniform` or `proportional`.")
+        
+        self._sampling_method = sampling_method
+        super(SamplerMultiTaskTrainer, self).__init__(model = model,
+                                                    task_list = task_list,
+                                                    optimizer_params = optimizer_params,
+                                                    lr_scheduler_params = lr_scheduler_params,
+                                                    patience = patience,
+                                                    num_epochs = num_epochs,
+                                                    serialization_dir=serialization_dir,
+                                                    cuda_device = cuda_device,
+                                                    grad_norm = grad_norm,
+                                                    grad_clipping = grad_clipping,
+                                                    min_lr = min_lr,
+                                                    no_tqdm = no_tqdm,
+                                                    summary_interval = summary_interval,
+                                                    log_parameter_statistics = log_parameter_statistics,
+                                                    log_gradient_statistics = log_gradient_statistics)
+        
+
+
+    @overrides
+    def train(self, 
+            recover: bool = False):
+        '''
+        Train the different task_list, save the different checkpoints and metrics,
+        and save the model at the end of training while logging the training details.
+        
+        The metrics through the training are stored in dictionaries with the following structure:
+        
+        all_metrics - Dict[str, str]
+            task_name: val_metric
+
+        metric_infos (Dict[])
+            task_name (Dict[str, diverse]
+                val_metric (str): name (str)
+                hist (str): history_of_the_val_metric (List[float])
+                stopped (str): training_is_stopped (bool)
+                best (str): best_epoch_for_val_metric (Tuple(int, Dict))  
+
+        all_tr_metrics (Dict[str, Dict[str, float]])
+            task_name (Dict[str, float])
+                metric_name (str): value (float)
+                loss: value (float)		
+
+        all_val_metrics (Dict[str, Dict[str, float]])
+            task_name (Dict[str, float])
+                metric_name (str): value (float)
+                loss (str): value (float)
+        
+        Parameters
+        ----------
+        task_list: List[Task], required
+            A list containing the tasks to train.
+        params: Params, required
+            Training parameters
+        recover: bool, required
+            Whether or not training should be recovered from a previous training.
+
+        Returns
+        -------
+        return_dict: Dict
+            A dictionary summarizing the training and the metrics for the best epochs for each task.
+        '''
+        training_start_time = time.time()
+        
+        if recover:
+            try:
+                n_epoch, should_stop = self._restore_checkpoint()
+                logger.info("Loaded model from checkpoint. Starting at epoch %d", n_epoch)
+            except RuntimeError:
+                raise ConfigurationError("Could not recover training from the checkpoint.  Did you mean to output to "
+                                         "a different serialization directory or delete the existing serialization "
+                                         "directory?")
+        else:
+            n_epoch, should_stop = 0, False	
+                    
+            ### Store all the necessary informations and attributes about the tasks ###
+            task_infos = {task._name: {} for task in self._task_list}
+            for task_idx, task in enumerate(self._task_list):
+                task_info = task_infos[task._name]
+                
+                # Store statistiscs on training and validation batches
+                data_iterator = task._data_iterator
+                n_tr_batches = data_iterator.get_num_batches(task._train_data)
+                n_val_batches = data_iterator.get_num_batches(task._validation_data)
+                task_info['n_tr_batches'] = n_tr_batches
+                task_info['n_val_batches'] = n_val_batches
+                
+                # Create counter for number of batches trained during the whole
+                # training for this specific task
+                task_info['total_n_batches_trained'] = 0
+                
+                task_info['last_log'] = time.time() # Time of last logging
+            self._task_infos = task_infos
+            
+            ### Bookkeeping the validation metrics ###
+            metric_infos = {task._name: {'val_metric': task._val_metric, 
+                                        'hist': [], 
+                                        'is_out_of_patience': False,
+                                        'min_lr_hit': False, 
+                                        'best': (-1, {})} 
+                            for task in self._task_list}
+            self._metric_infos = metric_infos
+        
+                
+        ### Write log ###
+        total_n_tr_batches = 0 # The total number of training batches across all the datasets.
+        for task_name, info in self._task_infos.items():
+            total_n_tr_batches += info["n_tr_batches"]
+            logger.info("Task %s:", task_name)
+            logger.info("\t%d training batches", info["n_tr_batches"])
+            logger.info("\t%d validation batches", info["n_val_batches"])
+                
+        
+        ### Create the training generators/iterators tqdm ###
+        self._tr_generators = {}
+        for task in self._task_list:
+            data_iterator = task._data_iterator
+            tr_generator = data_iterator(task._train_data,
+                                        num_epochs = None)
+            self._tr_generators[task._name] = tr_generator
+            
+        
+        ### Create sampling probability distribution ###
+        if self._sampling_method == "uniform":
+            sampling_prob = [float(1/self._n_tasks)]*self._n_tasks
+        elif self._sampling_method == "proportional":
+            sampling_prob = [float(info['n_tr_batches']/total_n_tr_batches) for info in self._task_infos.values()]
+
+
+        ### Enable gradient clipping ###
+        # Only if self._grad_clipping is specified
+        self._enable_gradient_clipping()
+
+
+        ### Setup is ready. Training of the model can begin ###
+        logger.info("Set up ready. Beginning training/validation.")
+
+        
+        ### Begin Training of the model ###
+        while not should_stop:					
+            # Train one epoch (training pass + validation pass)
+            
+            
+            self._model.train() # Set the model to "train" mode.
+            
+            
+            ### Log Infos: current epoch count and CPU/GPU usage ###
+            logger.info("")
+            logger.info("Epoch %d/%d - Begin", n_epoch, self._num_epochs - 1)
+            logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
+            for gpu, memory in gpu_memory_mb().items():
+                logger.info(f"GPU {gpu} memory usage MB: {memory}")	
+            
+            logger.info("Training - Begin")
+            
+            
+            ### Reset training and trained batches counter before new training epoch ###
+            for _, task_info in self._task_infos.items():
+                task_info["tr_loss_cum"] = 0.0
+                task_info["n_batches_trained_this_epoch"] = 0
+            all_tr_metrics = {} # BUG TO COMPLETE COMMENT TO MAKE IT MORE CLEAR
+            
+            
+            ### Start training epoch ###
+            epoch_tqdm = tqdm.tqdm(range(total_n_tr_batches), total = total_n_tr_batches)
+            for _ in epoch_tqdm:
+                task_idx = np.argmax(np.random.multinomial(1, sampling_prob)) 
+                task = self._task_list[task_idx]
+                task_info = self._task_infos[task._name]
+                
+                
+                ### One forward + backward pass ###
+                
+                # Call next batch to train
+                batch = next(self._tr_generators[task._name])
+                task_info["n_batches_trained_this_epoch"] += 1
+
+                # Load optimizer				
+                optimizer = self._optimizers[task._name]			
+                optimizer.zero_grad()
+                
+                # Get the loss for this batch
+                output_dict = self._forward(tensor_batch = batch, task = task, for_training = True)
+                assert "loss" in output_dict, "Model must return a dict containing a 'loss' key"
+                loss = output_dict["loss"]
+                loss.backward()
+                task_info["tr_loss_cum"] += loss.item()
+
+                # Gradient rescaling if self._grad_norm is specified
+                self._rescale_gradients()
+                
+                # Take an optimization step
+                optimizer.step()
+                
+                
+                ### Get metrics for all progress so far, update tqdm, display description ###
+                task_metrics = self._get_metrics(task = task)
+                task_metrics["loss"] = float(task_info["tr_loss_cum"] / (task_info["n_batches_trained_this_epoch"]+0.000001))
+                description = self._description_from_metrics(task_metrics)
+                epoch_tqdm.set_description(task._name + ", " + description)
+
+                        
+                ### Tensorboard logging: Training detailled metrics, parameters and gradients ###
+                if self._global_step % self._summary_interval == 0:
+                    # Metrics
+                    for metric_name, value in task_metrics.items():
+                        self._tensorboard.add_train_scalar(name = "training_details/" + task._name + "/" + metric_name,
+                                                        value = value,
+                                                        global_step = self._global_step)
+                    # Parameters and Gradients
+                    for param_name, param in self._model.named_parameters():
+                        if self._log_parameter_statistics:
+                            self._tensorboard.add_train_scalar(name = "parameter_mean/" + param_name,
+                                                            value = param.data.mean(),
+                                                            global_step = self._global_step)
+                            self._tensorboard.add_train_scalar(name = "parameter_std/" + param_name,
+                                                            value = param.data.std(),
+                                                            global_step = self._global_step)
+                        if param.grad is None:
+                            continue
+                        if self._log_gradient_statistics:
+                            self._tensorboard.add_train_scalar(name = "grad_mean/" + param_name,
+                                                            value = param.grad.data.mean(),
+                                                            global_step = self._global_step)
+                            self._tensorboard.add_train_scalar(name = "grad_std/" + param_name,
+                                                            value = param.grad.data.std(),
+                                                            global_step = self._global_step)						
+                self._global_step += 1
+                        
+
+
+            ### Bookkeeping all the training metrics for all the tasks on the training epoch that just finished ###
+            for task in self._task_list:
+                task_info = self._task_infos[task._name]
+                    
+                task_info['total_n_batches_trained'] += task_info["n_batches_trained_this_epoch"]
+                task_info['last_log'] = time.time()
+                
+                task_metrics = self._get_metrics(task = task, reset = True)
+                if task._name not in all_tr_metrics:
+                    all_tr_metrics[task._name ] = {}
+                for name, value in task_metrics.items():
+                    all_tr_metrics[task._name][name] = value
+                all_tr_metrics[task._name]["loss"] = \
+                    float(task_info["tr_loss_cum"] / (task_info["n_batches_trained_this_epoch"]+0.00000001))
+            
+                # Tensorboard - Training metrics for this epoch
+                self._tensorboard.add_train_scalar(name = "training_proportions/" + task._name, 
+                                                value = task_info['n_batches_trained_this_epoch'],
+                                                global_step = n_epoch)
+                for metric_name, value in all_tr_metrics[task._name].items():
+                    self._tensorboard.add_train_scalar(name = "task_" + task._name + "/" + metric_name,
+                                                    value = value,
+                                                    global_step = n_epoch)
+            
+            
+            logger.info("Train - End")
+
+
+
+
+            ### Begin validation of the model ###
+            logger.info("Validation - Begin")
+            all_val_metrics = {}
+            
+            
+            self._model.eval() #Set the model into evaluation mode
+            
+            
+            for task_idx, task in enumerate(self._task_list):
+                logger.info("Validation - Task %d/%d: %s", task_idx + 1, self._n_tasks, task._name)
+                
+                val_loss = 0.0
+                n_batches_val_this_epoch_this_task = 0
+                n_val_batches = self._task_infos[task._name]['n_val_batches']
+                scheduler = self._schedulers[task._name]
+                
+                # Create tqdm generator for current task's validation
+                data_iterator = task._data_iterator
+                val_generator = data_iterator(task._validation_data, 
+                                            num_epochs = 1,
+                                            shuffle = False)
+                val_generator_tqdm = tqdm.tqdm(val_generator, 
+                                            total = n_val_batches)
+                
+                # Iterate over each validation batch for this task
+                for batch in val_generator_tqdm:
+                    n_batches_val_this_epoch_this_task += 1
+                    
+                    # Get the loss
+                    val_output_dict = self._forward(batch, task = task, for_training = False)
+                    loss = val_output_dict["loss"]
+                    val_loss += loss.item()
+                    
+                    # Get metrics for all progress so far, update tqdm, display description
+                    task_metrics = self._get_metrics(task = task)
+                    task_metrics["loss"] = float(val_loss / n_batches_val_this_epoch_this_task)
+                    description = self._description_from_metrics(task_metrics)
+                    val_generator_tqdm.set_description(description)	
+                    
+                # Get task validation metrics and store them in all_val_metrics
+                task_metrics = self._get_metrics(task = task, reset = True)
+                if task._name not in all_val_metrics:
+                    all_val_metrics[task._name] = {}
+                for name, value in task_metrics.items():
+                    all_val_metrics[task._name][name] = value
+                all_val_metrics[task._name]["loss"] = float(val_loss / n_batches_val_this_epoch_this_task)
+                                
+                # Tensorboard - Validation metrics for this epoch
+                for metric_name, value in all_val_metrics[task._name].items():
+                    self._tensorboard.add_validation_scalar(name = "task_" + task._name + "/" + metric_name, 
+                                                            value = value, 
+                                                            global_step = n_epoch)
+
+                
+                ### Perform a patience check and update the history of validation metric for this task ###
+                this_epoch_val_metric = all_val_metrics[task._name][task._val_metric]
+                metric_history = self._metric_infos[task._name]['hist']
+                
+                metric_history.append(this_epoch_val_metric)
+                is_best_so_far, out_of_patience = self._check_history(metric_history = metric_history, 
+                                                                    cur_score = this_epoch_val_metric,
+                                                                    should_decrease = task._val_metric_decreases)
+                                            
+                if is_best_so_far:
+                    logger.info("Best model found for %s.", task._name)
+                    self._metric_infos[task._name]['best'] = (n_epoch, all_val_metrics)
+                if out_of_patience and not self._metric_infos[task._name]['is_out_of_patience']:
+                    self._metric_infos[task._name]['is_out_of_patience'] = True
+                    logger.info("Task %s is out of patience and vote to stop the training.", task._name)
+                
+                # The LRScheduler API is agnostic to whether your schedule requires a validation metric -
+                # if it doesn't, the validation metric passed here is ignored.
+                scheduler.step(this_epoch_val_metric, n_epoch)
+                
+            
+            logger.info("Validation - End")
+            
+            
+            ### Print all training and validation metrics for this epoch ###
+            logger.info("***** Epoch %d/%d Statistics *****", n_epoch, self._num_epochs - 1)
+            for task in self._task_list:
+                logger.info("Statistic: %s", task._name)
+                logger.info("\tTraining - %s: %3d", "Nb batches trained", self._task_infos[task._name]["n_batches_trained_this_epoch"])
+                for metric_name, value in all_tr_metrics[task._name].items():
+                    logger.info("\tTraining - %s: %3f", metric_name, value)
+                for metric_name, value in all_val_metrics[task._name].items():
+                    logger.info("\tValidation - %s: %3f", metric_name, value)
+            logger.info("**********")
+
+
+            ### Check to see if should stop ###
+            stop_tr, stop_val = True, True
+            
+            for task in self._task_list:
+                #task_info = self._task_infos[task._name]
+                if self._optimizers[task._name].param_groups[0]['lr'] < self._min_lr:
+                    logger.info("Minimum lr hit on %s.", task._name)
+                    logger.info("Task %s vote to stop training.", task._name)
+                    metric_infos[task._name]['min_lr_hit'] = True
+                stop_tr = stop_tr and self._metric_infos[task._name]['min_lr_hit']
+                stop_val = stop_val and self._metric_infos[task._name]['is_out_of_patience']
+
+            if stop_tr:
+                should_stop = True
+                logging.info("All tasks hit minimum lr. Stopping training.")
+            if stop_val:
+                should_stop = True
+                logging.info("All metrics ran out of patience. Stopping training.")
+            if n_epoch >= self._num_epochs - 1:
+                should_stop = True
+                logging.info("Maximum number of epoch hit. Stopping training.")
+
+            self._save_checkpoint(n_epoch, should_stop)
+            
+            
+            ### Update n_epoch ###
+            # One epoch = doing N (forward + backward) pass where N is the total number of training batches.
+            n_epoch += 1 
+
+
+        ### Summarize training at the end ###
+        logging.info('***** Training is finished *****')
+        logging.info('Stopped training after %d epochs', n_epoch)
+        return_metrics = {}
+        for task_name, task_info in self._task_infos.items():
+            nb_epoch_trained = int(task_info['total_n_batches_trained'] / task_info['n_tr_batches'])
+            logging.info('Trained %s for %d batches ~= %d epochs',
+                         task_name, 
+                         task_info['total_n_batches_trained'],
+                         nb_epoch_trained)
+            return_metrics[task._name] = {"best_epoch": self._metric_infos[task_name]['best'][0],
+                                        "nb_epoch_trained": nb_epoch_trained,
+                                        "best_epoch_val_metrics": self._metric_infos[task_name]['best'][1]}
+            
+        training_elapsed_time = time.time() - training_start_time	
+        return_metrics["training_duration"] = time.strftime("%d:%H:%M:%S", time.gmtime(training_elapsed_time))
+        return_metrics["nb_epoch_trained"] = n_epoch
+        
+        
+        return return_metrics
+
+    @classmethod
+    def from_params(cls, 
+                    model: Model, 
+                    task_list: List[Task],
+                    serialization_dir: str, 
+                    params: Params) -> 'SamplerMultiTaskTrainer':
+        ''' Generator multi-task trainer from parameters.  '''
+
+        optimizer_params = params.pop("optimizer")
+        lr_scheduler_params = params.pop("scheduler")
+        patience = params.pop_int("patience", 2)
+        num_epochs = params.pop_int("num_epochs", 20)
+        cuda_device = params.pop_int("cuda_device", -1)
+        grad_norm = params.pop_float("grad_norm", None)
+        grad_clipping = params.pop_float("grad_clipping", None)
+        min_lr = params.pop_float("min_lr", 0.00001)
+        no_tqdm = params.pop_bool("no_tqdm", False)
+        summary_interval = params.pop("sumarry_interval", 50)
+        log_parameter_statistics = params.pop("log_parameter_statistics", False)
+        log_gradient_statistics = params.pop("log_gradient_statistics", False)
+        sampling_method = params.pop("sampling_method", "proportional")
+
+        params.assert_empty(cls.__name__)
+        return SamplerMultiTaskTrainer(model = model,
+                                task_list = task_list,
+                                optimizer_params = optimizer_params,
+                                lr_scheduler_params = lr_scheduler_params,
+                                patience = patience,
+                                num_epochs = num_epochs,
+                                serialization_dir = serialization_dir,
+                                cuda_device = cuda_device,
+                                grad_norm = grad_norm,
+                                grad_clipping = grad_clipping,
+                                min_lr = min_lr,
+                                no_tqdm = no_tqdm,
+                                summary_interval = summary_interval,
+                                log_parameter_statistics = log_parameter_statistics,
+                                log_gradient_statistics = log_gradient_statistics,
+                                sampling_method = sampling_method)
\ No newline at end of file
diff --git a/html_senteval.py b/html_senteval.py
new file mode 100644
index 0000000..3d52f2f
--- /dev/null
+++ b/html_senteval.py
@@ -0,0 +1,166 @@
+# coding: utf-8
+
+"""
+A quick and simple script for evaluating the embeddings through the HMTL model/hierarchy
+using SentEval.
+"""
+
+
+from __future__ import absolute_import, division, unicode_literals
+
+import sys
+import io
+import numpy as np
+import logging
+import re
+
+# Set PATHs
+PATH_TO_SENTEVAL = './SentEval/'
+PATH_TO_DATA = './SentEval/data'
+sys.path.insert(0, PATH_TO_SENTEVAL)
+import senteval
+
+import os
+import torch
+import argparse
+
+from allennlp.common.params import Params
+from allennlp.data.token_indexers import TokenIndexer
+from allennlp.data import Token, Instance, Vocabulary
+from allennlp.data.dataset import Batch
+from allennlp.data.fields import TextField
+from allennlp.nn import util
+from allennlp.models.model import Model
+
+import hmtl
+
+
+def text_to_instance(sent, token_indexers):
+    text = TextField([Token(word) for word in sent], token_indexers = token_indexers)
+    instance = Instance({"text": text})
+    return instance
+
+def sentences_to_indexed_batch(sentences, token_indexers):
+    instances = [text_to_instance(sent, token_indexers) for sent in sentences]
+    batch = Batch(instances)
+    batch.index_instances(vocab)
+    return batch 	
+    
+def compute_embds_from_layer(model, model_layer_name, batch):
+    batch_tensor = batch.as_tensor_dict(batch.get_padding_lengths())
+    text = batch_tensor["text"]
+    text_mask = util.get_text_field_mask(text)
+    
+    if model_layer_name == "text_field_embedder":
+        embds_text_field_embedder = model._text_field_embedder(text)
+        embds = embds_text_field_embedder
+        
+    if model_layer_name == "encoder_ner":
+        embds_text_field_embedder = model._text_field_embedder(text)
+        embds_encoder_ner = model._encoder_ner(embds_text_field_embedder, text_mask)
+        embds = embds_encoder_ner
+        
+    if model_layer_name == "encoder_emd":
+        embds_text_field_embedder = model._shortcut_text_field_embedder(text)
+        embds_encoder_emd = model._encoder_emd(embds_text_field_embedder, text_mask)
+        embds = embds_encoder_emd
+        
+    if model_layer_name == "encoder_relation":
+        embds_text_field_embedder = model._shortcut_text_field_embedder_relation(text)
+        embds_encoder_relation = model._encoder_relation(embds_text_field_embedder, text_mask)
+        embds = embds_encoder_relation
+    
+    if model_layer_name == "encoder_coref":
+        embds_text_field_embedder = model._shortcut_text_field_embedder_coref(text)
+        embds_encoder_coref = model._encoder_coref(embds_text_field_embedder, text_mask)
+        embds = embds_encoder_coref
+    
+    emds_size = embds.size(2)
+    expanded_text_mask = torch.cat([text_mask.unsqueeze(-1)]*emds_size, dim = -1)
+        
+    embds_sum = (embds*expanded_text_mask.float()).sum(dim = 1)
+    normalization = torch.cat([(1/text_mask.float().sum(-1)).unsqueeze(-1)]*emds_size, dim = -1)
+    computed_embds = (embds_sum*normalization)
+    
+    return computed_embds.detach().numpy()
+
+
+# SentEval prepare and batcher
+def prepare(params, samples):
+    return
+
+def batcher(params, batch):
+    batch = sentences_to_indexed_batch(batch, token_index)
+    embds = compute_embds_from_layer(model, args.layer_name, batch)
+    return embds
+
+
+# Set params for SentEval
+params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
+params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
+                                 'tenacity': 3, 'epoch_size': 2}
+
+
+# Set up logger
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+if __name__ == "__main__":	
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-s",
+                        "--serialization_dir",
+                        required = True,
+                        help = "Directory from which to load the pretrained model.", 
+                        type = str)
+    parser.add_argument("-t",
+                        "--task",
+                        required = False,
+                        default = "ner",
+                        help = "Name of the task to load.", 
+                        type = str)	
+    parser.add_argument("-l",
+                        "--layer_name",
+                        required = False,
+                        default = "text_field_embedder",
+                        help = "Name of encoder/embedding layer of the model", 
+                        type = str)										
+    args = parser.parse_args()
+    
+    
+    serialization_dir = args.serialization_dir
+
+    params = Params.from_file(params_file = os.path.join(args.serialization_dir, "config.json"))
+    logging.info("Parameters loaded from %s", os.path.join(serialization_dir, "config.json"))
+    
+    ### Load Vocabulary from files ###
+    logging.info("Loading Vocavulary from %s", os.path.join(serialization_dir, "vocabulary"))
+    vocab = Vocabulary.from_files(os.path.join(args.serialization_dir, "vocabulary"))
+    logger.info("Vocabulary loaded")
+    
+    ### Create model ###
+    model_params = params.pop("model")
+    model = Model.from_params(vocab = vocab, params = model_params, regularizer = None)
+    best_model_state_path = os.path.join(serialization_dir, "best_{}.th".format(args.task))
+    best_model_state = torch.load(best_model_state_path)
+    model.load_state_dict(state_dict = best_model_state)
+    
+    ### Create token indexer ###
+    token_index = {}
+    task_keys = [key for key in params.keys() if re.search("^task_", key)] 
+    token_indexer_params = params.pop(task_keys[-1]).pop("data_params").pop("dataset_reader").pop("token_indexers")
+    for name, indexer_params in token_indexer_params.items(): 
+        token_index[name] = TokenIndexer.from_params(indexer_params) 
+    
+    params_senteval['encoder'] = model
+    
+    se = senteval.engine.SE(params_senteval, batcher, prepare)
+    transfer_tasks = ['Length', 'WordContent', 'Depth', 'TopConstituents',
+                      'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
+                      'OddManOut', 'CoordinationInversion']
+    results = se.eval(transfer_tasks)
+    
+    print(results)
+    logging.info("SentEval(uation) Finished")
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..9d5e33b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,102 @@
+alabaster==0.7.12
+allennlp==0.7.0
+asn1crypto==0.24.0
+atomicwrites==1.2.1
+attrs==18.2.0
+aws-xray-sdk==0.95
+awscli==1.16.38
+Babel==2.6.0
+biscuits==0.1.1
+boto==2.49.0
+boto3==1.9.28
+botocore==1.12.28
+certifi==2018.10.15
+cffi==1.11.2
+chardet==3.0.4
+Click==7.0
+colorama==0.3.9
+conllu==0.11
+cookies==2.2.1
+cryptography==2.3.1
+cymem==2.0.2
+cytoolz==0.9.0.1
+dill==0.2.8.2
+docker==3.5.1
+docker-pycreds==0.3.0
+docutils==0.14
+ecdsa==0.13
+editdistance==0.5.2
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
+flaky==3.4.0
+Flask==0.12.4
+Flask-Cors==3.0.3
+ftfy==5.5.0
+future==0.16.0
+gevent==1.3.6
+greenlet==0.4.15
+h5py==2.8.0
+idna==2.7
+imagesize==1.1.0
+ItsDangerous==1.1.0
+Jinja2==2.10
+jmespath==0.9.3
+jsondiff==1.1.1
+jsonnet==0.10.0
+jsonpickle==1.0
+MarkupSafe==1.0
+mock==2.0.0
+more-itertools==4.3.0
+moto==1.3.4
+msgpack==0.5.6
+msgpack-numpy==0.4.3.2
+murmurhash==1.0.1
+nltk==3.3
+numpy==1.15.2
+numpydoc==0.8.0
+overrides==1.9
+packaging==18.0
+parsimonious==0.8.0
+pbr==5.0.0
+plac==0.9.6
+pluggy==0.8.0
+preshed==2.0.1
+protobuf==3.6.1
+py==1.7.0
+pyaml==17.12.1
+pyasn1==0.4.4
+pycparser==2.19
+pycryptodome==3.6.6
+Pygments==2.2.0
+pyparsing==2.2.2
+pytest==3.9.1
+pytest-pythonpath==0.7.3
+python-dateutil==2.7.3
+python-jose==2.0.2
+pytz==2017.3
+PyYAML==3.13
+regex==2018.1.10
+requests==2.20.0
+responses==0.10.1
+rsa==3.4.2
+s3transfer==0.1.13
+scikit-learn==0.20.0
+scipy==1.1.0
+six==1.11.0
+snowballstemmer==1.2.1
+spacy==2.0.16
+Sphinx==1.8.1
+sphinxcontrib-websupport==1.1.0
+sqlparse==0.2.4
+tensorboardX==1.2
+thinc==6.12.0
+toolz==0.9.0
+torch==0.4.1
+tqdm==4.28.1
+ujson==1.35
+Unidecode==1.0.22
+urllib3==1.24
+wcwidth==0.1.7
+websocket-client==0.53.0
+Werkzeug==0.14.1
+wrapt==1.10.11
+xmltodict==0.11.0
diff --git a/scripts/data_setup.sh b/scripts/data_setup.sh
new file mode 100755
index 0000000..382c4f0
--- /dev/null
+++ b/scripts/data_setup.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Download the pre-trained ELMo and GloVe files into ./data (relative to the current directory).
+cd data || exit 1
+
+# ELMo: every file is fetched and then renamed to drop its "elmo_" prefix.
+mkdir -p elmo
+cd elmo || exit 1
+
+## Original size
+wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json
+mv elmo_2x4096_512_2048cnn_2xhighway_options.json 2x4096_512_2048cnn_2xhighway_options.json
+wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5
+mv elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5 2x4096_512_2048cnn_2xhighway_weights.hdf5
+
+## Medium size
+wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5
+mv elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5 2x2048_256_2048cnn_1xhighway_weights.hdf5
+wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json
+mv elmo_2x2048_256_2048cnn_1xhighway_options.json 2x2048_256_2048cnn_1xhighway_options.json
+
+## Small size
+wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5
+mv elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5 2x1024_128_2048cnn_1xhighway_weights.hdf5
+wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json
+mv elmo_2x1024_128_2048cnn_1xhighway_options.json 2x1024_128_2048cnn_1xhighway_options.json
+
+# GloVe
+cd .. || exit 1
+mkdir -p glove
+cd glove || exit 1
+wget https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz
\ No newline at end of file
diff --git a/scripts/machine_setup.sh b/scripts/machine_setup.sh
new file mode 100755
index 0000000..d3342e2
--- /dev/null
+++ b/scripts/machine_setup.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+### Install git-lfs ###
+curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
+sudo apt-get install git-lfs
+git lfs install
+
+
+### Install Python3.6 (from the deadsnakes PPA) and pip ###
+sudo add-apt-repository ppa:deadsnakes/ppa
+sudo apt-get update
+sudo apt-get install python3.6 python3.6-dev
+wget https://bootstrap.pypa.io/get-pip.py
+sudo python3.6 get-pip.py
+sudo ln -s /usr/bin/python3.6 /usr/local/bin/python3
+sudo ln -s /usr/local/bin/pip /usr/local/bin/pip3
+
+
+### Create a clean Python3.6 virtual environment in ./.env and activate it ###
+sudo pip3 install virtualenv
+virtualenv -p /usr/bin/python3.6 .env
+source ./.env/bin/activate
+
+
+### Install dependencies ###
+pip install -r requirements.txt
+
+
+### Install submodules (SentEval) and download its transfer-task data ###
+git submodule init
+git submodule update
+
+sudo apt-get install unzip
+cd SentEval/data/downstream/ || exit 1
+./get_transfer_data.bash
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..f0a54f4
--- /dev/null
+++ b/train.py
@@ -0,0 +1,237 @@
+# coding: utf-8
+
+"""
+The ``train.py`` file can be used to train a model.
+It requires a configuration file and a directory in
+which to write the results.
+
+.. code-block:: bash
+
+   $ python train.py --help
+    usage: train.py [-h] -s SERIALIZATION_DIR -c CONFIG_FILE_PATH [-r]
+
+    optional arguments:
+    -h, --help            show this help message and exit
+    -s SERIALIZATION_DIR, --serialization_dir SERIALIZATION_DIR
+                            Directory in which to save the model and its logs.
+    -c CONFIG_FILE_PATH, --config_file_path CONFIG_FILE_PATH
+                            Path to parameter file describing the multi-tasked
+                            model to be trained.
+    -r, --recover         Recover a previous training from the state in
+                            serialization_dir.
+"""
+
+import argparse
+import itertools
+import os
+import json
+import re
+from copy import deepcopy
+import torch
+import logging
+from typing import List, Dict, Any, Tuple
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+
+from hmtl.tasks import Task
+from hmtl.training.multi_task_trainer import MultiTaskTrainer
+from hmtl.common import create_and_set_iterators
+from evaluate import evaluate
+
+from allennlp.models.model import Model
+from allennlp.data import Vocabulary
+from allennlp.data.iterators import DataIterator
+from allennlp.commands.train import create_serialization_dir
+from allennlp.common.params import Params
+from allennlp.nn import RegularizerApplicator
+
+logger = logging.getLogger(__name__)
+        
+        
+def tasks_and_vocab_from_params(params: Params,
+                                serialization_dir: str) -> Tuple[List[Task], Vocabulary]:
+    '''
+    Build every task declared in ``params``, load its datasets and fit a vocabulary
+    on the instances that the tasks expose for vocabulary creation.
+    Every top-level key of ``params`` starting with ``task_`` is popped; it must hold
+    a ``task_description`` entry and a ``data_params`` entry.
+    
+    Parameters
+    ----------
+    params: ``Params``
+        A parameter object specifying an experiment.
+    serialization_dir: ``str``
+        Directory in which the ``vocabulary`` sub-directory is written.
+    Returns
+    -------
+    task_list: ``List[Task]``
+        The tasks of the model to train, in the order of their keys in ``params``.
+    vocab: ``Vocabulary``
+        The vocabulary fitted on the chained vocabulary-creation instances of all the tasks.
+    '''
+    task_list = []
+    instance_sources = []
+    datasets_for_vocab_creation = {}
+    
+    ### Instantiate the different tasks ###
+    # ``re.match`` only matches at the beginning of the string, i.e. keys prefixed by "task_".
+    for task_key in [key for key in params.keys() if re.match("task_", key)]:
+        logger.info("Creating %s", task_key)
+        task_params = params.pop(task_key)
+        description_params = task_params.pop("task_description")
+        data_params = task_params.pop("data_params")
+        
+        task = Task.from_params(params = description_params)
+        task_list.append(task)
+        
+        vocab_instances, vocab_dataset_names = task.load_data_from_params(params = data_params)
+        instance_sources.append(vocab_instances)
+        datasets_for_vocab_creation[task._name] = vocab_dataset_names
+    
+    
+    ### Create and save the vocabulary ###
+    for task_name, dataset_names in datasets_for_vocab_creation.items():
+        logger.info("Creating a vocabulary using %s data from %s.", ", ".join(dataset_names), task_name)
+    
+    logger.info("Fitting vocabulary from dataset")
+    vocab = Vocabulary.from_params(params.pop("vocabulary", {}), itertools.chain.from_iterable(instance_sources))
+    
+    vocab_dir = os.path.join(serialization_dir, "vocabulary")
+    vocab.save_to_files(vocab_dir)
+    logger.info("Vocabulary saved to %s", vocab_dir)
+    return task_list, vocab
+    
+def train_model(multi_task_trainer: MultiTaskTrainer,
+                recover: bool = False)-> Dict[str, Any]:
+    '''
+    Train the multi-task model, then evaluate the best weights of each task on the test sets.
+    The test metrics are merged into the training metrics, which are dumped to ``metrics.json``.
+    
+    Parameters
+    ----------
+    multi_task_trainer: ``MultiTaskTrainer``
+        The trainer to run. Its task list, serialization directory, model and cuda device
+        are reused for the evaluation on the test sets.
+    recover : ``bool``, optional (default=False)
+        Forwarded to the trainer. If ``True``, we will try to recover a training run from an
+        existing serialization directory (only intended for runs that crashed mid-way).
+    
+    Returns
+    -------
+    metrics: ``Dict[str, Any]``
+        The metrics returned by the trainer. For every task evaluated on test, a ``"test"``
+        entry maps the name of each evaluated task to its test metrics.
+    
+    Raises
+    ------
+    AssertionError
+        If a task asks to be evaluated on test but has no test data loaded.
+    '''
+    ### Train the multi-task model ###
+    metrics = multi_task_trainer.train(recover = recover)
+    
+    serialization_dir = multi_task_trainer._serialization_dir
+    model = multi_task_trainer._model
+    # Only the tasks flagged for test evaluation take part in the loops below.
+    tasks_with_test = [task for task in multi_task_trainer._task_list if task._evaluate_on_test]
+    
+    ### Evaluate the model on test data if necessary ###
+    # In this multi-task setting the best validation metrics of the different tasks are not necessarily
+    # reached at the same epoch (one epoch being equal to N forward+backward passes, where N is the
+    # total number of batches in all the training sets). Hence the best weights of each task (based on
+    # the validation metrics) are loaded in turn and evaluated on every task which has a test set.
+    for task in tasks_with_test:
+        logger.info("Task %s will be evaluated using the best epoch weights.", task._name)
+        assert task._test_data is not None, "Task {} wants to be evaluated on test dataset but no there is no test data loaded.".format(task._name)
+        
+        logger.info("Loading the best epoch weights for task %s", task._name)
+        weights_file = "best_{}.th".format(task._name)
+        # The model object of the trainer is updated in place with the best weights of ``task``.
+        model.load_state_dict(state_dict = torch.load(os.path.join(serialization_dir, weights_file)))
+        
+        test_metric_dict = {}
+        for pair_task in tasks_with_test:
+            logger.info("Pair task %s is evaluated with the best model for %s", pair_task._name, task._name)
+            pair_metrics = evaluate(model = model,
+                                    task_name = pair_task._name,
+                                    instances = pair_task._test_data,
+                                    data_iterator = pair_task._data_iterator,
+                                    cuda_device = multi_task_trainer._cuda_device)
+            test_metric_dict[pair_task._name] = {name: value for name, value in pair_metrics.items()}
+        
+        # One entry per evaluated pair task, stored under the task whose best weights were loaded.
+        metrics[task._name]["test"] = deepcopy(test_metric_dict)
+        logger.info("Finished evaluation of task %s.", task._name)
+    
+    
+    ### Dump validation and possibly test metrics ###
+    metrics_json = json.dumps(metrics, indent = 2)
+    metrics_path = os.path.join(serialization_dir, "metrics.json")
+    with open(metrics_path, "w") as metrics_file:
+        metrics_file.write(metrics_json)
+    logger.info("Metrics: %s", metrics_json)
+    
+    return metrics
+    
+
+if __name__ == "__main__":
+    # Parse the command-line arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-s",
+                        "--serialization_dir",
+                        required = True,
+                        help = "Directory in which to save the model and its logs.",
+                        type = str)
+    parser.add_argument("-c",
+                        "--config_file_path",
+                        required = True,
+                        help = "Path to parameter file describing the multi-tasked model to be trained.",
+                        type = str)
+    parser.add_argument("-r",
+                        "--recover",
+                        action = "store_true",
+                        default = False,
+                        help = "Recover a previous training from the state in serialization_dir.")
+    args = parser.parse_args()
+    
+    
+    params = Params.from_file(params_file = args.config_file_path)
+    serialization_dir = args.serialization_dir
+    create_serialization_dir(params, serialization_dir, args.recover)
+    # Save a copy of the whole configuration: the steps below pop entries out of ``params``.
+    serialization_params = deepcopy(params).as_dict(quiet=True)
+    with open(os.path.join(serialization_dir, "config.json"), "w") as param_file:
+        json.dump(serialization_params, param_file, indent = 4)
+    
+
+    ### Instantiate the different tasks from the param file, load datasets and create vocabulary ###
+    tasks, vocab = tasks_and_vocab_from_params(params = params, serialization_dir = serialization_dir)
+
+
+    ### Load the data iterators for each task ###
+    tasks = create_and_set_iterators(params = params, task_list = tasks, vocab = vocab)
+    
+    
+    ### Load Regularizations (an empty list is used when the config has no "regularizer" entry) ###
+    regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))
+    
+    
+    ### Create model ###
+    model_params = params.pop("model")
+    model = Model.from_params(vocab = vocab, params = model_params, regularizer = regularizer)
+        
+        
+    ### Create multi-task trainer ###
+    multi_task_trainer_params = params.pop("multi_task_trainer")
+    trainer = MultiTaskTrainer.from_params(model = model,
+                                        task_list = tasks,
+                                        serialization_dir = serialization_dir,
+                                        params = multi_task_trainer_params)
+    
+    
+    ### Launch training ###
+    metrics = train_model(multi_task_trainer = trainer,
+                        recover = args.recover)
+    if metrics is not None:
+        logger.info("Training is finished ! Let's have a drink. It's on the house !")
\ No newline at end of file