Initial Commit

huggingface · Oct 31, 2018 · f1a43d5 · f1a43d5
1 parent f517957
commit f1a43d5
Show file tree

Hide file tree

Showing 58 changed files with 7,222 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,16 @@
+**.DS_Store
+**/.DS_Store
+
+*.pyc
+*.pyo
+
+__pycache__/
+
+data/*
+serialization_dirs/
+nohup_logs/
+.env/
+
+*.ipynb
+*.vscode
+.ipynb_checkpoints
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "SentEval"]
+	path = SentEval
+	url = https://github.com/facebookresearch/SentEval
diff --git a/HMTL_architecture.png b/HMTL_architecture.png
diff --git a/README.md b/README.md
@@ -1,2 +1,61 @@
-# hmtl
-HMTL: Hierarchical Multi-Task Learning
+# HMTL (Hierarchical Multi-Task Learning model) 
+
+A Hierarchical Multi-Task Approach for Learning Embeddings from Semantic Tasks\
+Victor SANH, Thomas WOLF, Sebastian RUDER\
+AAAI 2019
+
+<img src="https://github.com/huggingface/jointmodelMD/blob/master/HMTL_architecture.png" alt="HMTL Architecture" width="350"/>
+
+## About
+
+HMTL is a Hierarchical Multi-Task Learning model which combines a set of four carefully selected semantic tasks (namely Named Entity Recognition, Entity Mention Detection, Relation Extraction and Coreference Resolution). The model achieves state-of-the-art results on Named Entity Recognition, Entity Mention Detection and Relation Extraction. Using [SentEval](https://github.com/facebookresearch/SentEval), we show that as we move from the bottom to the top layers of the model, the model tends to learn more complex semantic representations.
+
+For more details, we refer to our AAAI paper (LINK Arxiv).
+
+We release here the code for _training_, _fine-tuning_ and _evaluating_ HMTL. We hope that this code will be useful for building your own Multi-Task models (hierarchical or not). The code is written in __Python__ and powered by __PyTorch__.
+
+## Dependencies and installation
+
+The main dependencies are:
+- [AllenNLP](https://github.com/allenai/allennlp)
+- [PyTorch](https://pytorch.org/)
+- [SentEval](https://github.com/facebookresearch/SentEval) (only for evaluating the embeddings)
+
+The code works with __Python 3.6__. A stable version of the dependencies is listed in `requirements.txt`.
+
+You can quickly set up a working environment by calling the script `./script/machine_setup.sh`. It installs Python 3.6, creates a clean virtual environment, and installs all the required dependencies (listed in `requirements.txt`). Please adapt the script depending on your needs.
+
+## Example usage
+
+We base our implementation on the [AllenNLP library](https://github.com/allenai/allennlp). For an introduction to this library, you should check [these tutorials](https://allennlp.org/tutorials).
+
+An experiment is described in a _json_ configuration file (see `configs/*.json` for examples). The configuration file mainly describes the datasets to load, the model to create along with all the hyper-parameters of the model. 
+
+Once you have set up your configuration file (and defined custom classes if needed), you can simply launch a training with the following command and arguments:
+
+```bash
+python train.py --config_file_path configs/hmtl_coref_conll.json --serialization_dir my_first_training
+```
+
+Once the training has started, you can simply follow the training in the terminal or open a [Tensorboard](https://www.tensorflow.org/guide/summaries_and_tensorboard):
+
+```bash
+tensorboard --logdir my_first_training/log
+```
+
+## Evaluating the embeddings with SentEval
+
+We used [SentEval](https://github.com/facebookresearch/SentEval) to assess the linguistic properties learned by the model. `hmtl_senteval.py` gives an example of how we can create an interface between SentEval and HMTL.
+
+## Data
+
+To download the pre-trained embeddings we used in HMTL, you can simply call the script `./script/data_setup.sh`.
+
+We do not attach the datasets used to train HMTL for licensing reasons, but we invite you to collect them by yourself: [OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19), [CoNLL2003](https://www.clips.uantwerpen.be/conll2003/ner/), and [ACE2005](https://catalog.ldc.upenn.edu/LDC2006T06). The configuration files expect the datasets to be placed in the `data/` folder.
+
+## References
+
+```
+@article{
+}
+```
diff --git a/configs/coref_ace.json b/configs/coref_ace.json
@@ -0,0 +1,150 @@
+{
+	"task_coref":{
+		"task_description":{
+			"task_name": "coref",
+			"validation_metric_name": "coref_f1",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+
+		"data_params":{
+			"dataset_reader": {
+				"type": "coref_ace",
+				"max_span_width": 8,
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+
+
+			"train_data_path":"./data/ace2005/single_file_train_rahman.gold_conll",
+			"validation_data_path": "./data/ace2005/single_file_dev_rahman.gold_conll",
+			"test_data_path": "./data/ace2005/single_file_test_rahman.gold_conll",
+
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+
+	"model": {
+		"type": "coref_custom",
+
+		"text_field_embedder": {
+			"token_embedders": {
+				"tokens": {
+					"type": "embedding",
+					"pretrained_file": "./data/glove/glove.6B.100d.txt.gz",
+					"embedding_dim": 100,
+					"trainable": true
+				},
+				"elmo": {
+					"type": "elmo_token_embedder",
+					"options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json",
+					"weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5",
+					"do_layer_norm": false,
+					"dropout": 0,
+					"requires_grad": false
+				},
+				"token_characters": {
+					"type": "character_encoding",
+					"embedding": {
+						"embedding_dim": 16
+					},
+					"encoder": {
+						"type": "cnn",
+						"embedding_dim": 16,
+						"num_filters": 64,
+						"ngram_filter_sizes": [3]
+					},
+					"dropout": 0.1
+				}
+			}
+		},
+
+		"coref": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1188,
+				"hidden_size": 200,
+				"num_layers": 1,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"mention_feedforward": {
+					"input_dim": 2008,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"antecedent_feedforward": {
+					"input_dim": 6044,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"initializer": [
+					[".*linear_layers.*weight", {"type": "xavier_normal"}],
+					[".*scorer._module.weight", {"type": "xavier_normal"}],
+					["_distance_embedding.weight", {"type": "xavier_normal"}],
+					["_span_width_embedding.weight", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_hh.*", {"type": "orthogonal"}]
+				],
+				"lexical_dropout": 0.5,
+				"feature_size": 20,
+				"max_span_width": 8,
+				"spans_per_word": 0.4,
+				"max_antecedents": 70,
+				"eval_on_gold_mentions": false
+			}
+		}
+	},
+
+	"iterators": {
+		"iterator": {
+			"type": "basic",
+			"batch_size": 32
+		},
+		"iterator_coref": {
+			"type": "bucket",
+			"sorting_keys": [["text", "num_tokens"]],
+			"padding_noise": 0.0,
+			"batch_size": 1
+		  }
+	},
+
+	"multi_task_trainer": {
+		"type": "sampler_multi_task_trainer",
+		"sampling_method": "proportional",
+		"patience": 10,
+		"num_epochs": 100,
+		"min_lr": "1e-7",
+		"grad_norm": 5.0,
+		"grad_clipping": 10.0,
+		"cuda_device": 0,
+		"optimizer": {
+			"type": "adam", 
+			"lr": 0.001
+		},
+		"scheduler": {
+			"type": "reduce_on_plateau", 
+			"mode": "min", 
+			"factor": 0.5,
+			"patience": 5, 
+			"threshold": 0.0001,
+			"threshold_mode": "abs", 
+			"verbose": true
+		}
+	}
+}
diff --git a/configs/coref_conll.json b/configs/coref_conll.json
@@ -0,0 +1,150 @@
+{
+	"task_coref":{
+		"task_description":{
+			"task_name": "coref",
+			"validation_metric_name": "coref_f1",
+			"validation_metric_decreases": false,
+			"evaluate_on_test": true
+		},
+
+		"data_params":{
+			"dataset_reader": {
+				"type": "coref",
+				"max_span_width": 8,
+				"token_indexers": {
+					"tokens": {
+						"type": "single_id",
+						"lowercase_tokens": true
+					},
+					"token_characters":{
+						"type": "characters"
+					},
+					"elmo": {
+						"type": "elmo_characters"
+					}
+				}
+			},
+
+
+			"train_data_path":"./data/conll-2012_single_file/train.english.v4_gold_conll",
+			"validation_data_path": "./data/conll-2012_single_file/dev.english.v4_gold_conll",
+			"test_data_path": "./data/conll-2012_single_file/test.english.v4_gold_conll",
+
+			"datasets_for_vocab_creation": ["train"]
+		}
+	},
+
+	"model": {
+		"type": "coref_custom",
+
+		"text_field_embedder": {
+			"token_embedders": {
+				"tokens": {
+					"type": "embedding",
+					"pretrained_file": "./data/glove/glove.6B.100d.txt.gz",
+					"embedding_dim": 100,
+					"trainable": true
+				},
+				"elmo": {
+					"type": "elmo_token_embedder",
+					"options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json",
+					"weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5",
+					"do_layer_norm": false,
+					"dropout": 0,
+					"requires_grad": false
+				},
+				"token_characters": {
+					"type": "character_encoding",
+					"embedding": {
+						"embedding_dim": 16
+					},
+					"encoder": {
+						"type": "cnn",
+						"embedding_dim": 16,
+						"num_filters": 64,
+						"ngram_filter_sizes": [3]
+					},
+					"dropout": 0.1
+				}
+			}
+		},
+
+		"coref": {
+			"encoder": {
+				"type": "lstm",
+				"bidirectional": true,
+				"input_size": 1188,
+				"hidden_size": 200,
+				"num_layers": 1,
+				"dropout": 0.2
+			},
+			"tagger": {
+				"mention_feedforward": {
+					"input_dim": 2008,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"antecedent_feedforward": {
+					"input_dim": 6044,
+					"num_layers": 2,
+					"hidden_dims": 150,
+					"activations": "relu",
+					"dropout": 0.3
+				},
+				"initializer": [
+					[".*linear_layers.*weight", {"type": "xavier_normal"}],
+					[".*scorer._module.weight", {"type": "xavier_normal"}],
+					["_distance_embedding.weight", {"type": "xavier_normal"}],
+					["_span_width_embedding.weight", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}],
+					["_context_layer._module.weight_hh.*", {"type": "orthogonal"}]
+				],
+				"lexical_dropout": 0.5,
+				"feature_size": 20,
+				"max_span_width": 8,
+				"spans_per_word": 0.4,
+				"max_antecedents": 70,
+				"eval_on_gold_mentions": false
+			}
+		}
+	},
+
+	"iterators": {
+		"iterator": {
+			"type": "basic",
+			"batch_size": 32
+		},
+		"iterator_coref": {
+			"type": "bucket",
+			"sorting_keys": [["text", "num_tokens"]],
+			"padding_noise": 0.0,
+			"batch_size": 1
+		  }
+	},
+
+	"multi_task_trainer": {
+		"type": "sampler_multi_task_trainer",
+		"sampling_method": "proportional",
+		"patience": 10,
+		"num_epochs": 100,
+		"min_lr": "1e-7",
+		"grad_norm": 5.0,
+		"grad_clipping": 10.0,
+		"cuda_device": 0,
+		"optimizer": {
+			"type": "adam", 
+			"lr": 0.001
+		},
+		"scheduler": {
+			"type": "reduce_on_plateau", 
+			"mode": "min", 
+			"factor": 0.5,
+			"patience": 5, 
+			"threshold": 0.0001,
+			"threshold_mode": "abs", 
+			"verbose": true
+		}
+	}
+}