diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2b19e98 --- /dev/null +++ b/.gitignore @@ -0,0 +1,16 @@ +**.DS_Store +**/.DS_Store + +*.pyc +*.pyo + +__pycache__/ + +data/* +serialization_dirs/ +nohup_logs/ +.env/ + +*.ipynb +*.vscode +.ipynb_checkpoints diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..a318351 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "SentEval"] + path = SentEval + url = https://github.com/facebookresearch/SentEval diff --git a/HMTL_architecture.png b/HMTL_architecture.png new file mode 100644 index 0000000..6eda142 Binary files /dev/null and b/HMTL_architecture.png differ diff --git a/README.md b/README.md index ba7898f..8480ce6 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,61 @@ -# hmtl -HMTL: Hierarchical Multi-Task Learning +# HMTL (Hierarchical Multi-Task Learning model) + +A Hierarchical Multi-Task Approach for Learning Embeddings from Semantic Tasks\ +Victor SANH, Thomas WOLF, Sebastian RUDER\ +AAAI 2019 + +HMTL Architecture + +## About + +HMTL is a Hierarchical Multi-Task Learning model which combines a set of four carefully selected semantic tasks (namely Named Entity Recognition, Entity Mention Detection, Relation Extraction and Coreference Resolution). The model achieves state-of-the-art results on Named Entity Recognition, Entity Mention Detection and Relation Extraction. Using [SentEval](https://github.com/facebookresearch/SentEval), we show that as we move from the bottom to the top layers of the model, the model tends to learn more complex semantic representations. + +For more details, we refer to our AAAI paper (LINK Arxiv). + +We release here the code for _training_, _fine tuning_ and _evaluating_ HMTL. We hope that this code will be useful for building your own Multi-Task models (hierarchical or not). The code is written in __Python__ and powered by __PyTorch__. 
+ +## Dependencies and installation + +The main dependencies are: +- [AllenNLP](https://github.com/allenai/allennlp) +- [PyTorch](https://pytorch.org/) +- [SentEval](https://github.com/facebookresearch/SentEval) (only for evaluating the embeddings) + +The code works with __Python 3.6__. A stable version of the dependencies is listed in `requirements.txt`. + +You can quickly set up a working environment by calling the script `./script/machine_setup.sh`. It installs Python 3.6, creates a clean virtual environment, and installs all the required dependencies (listed in `requirements.txt`). Please adapt the script depending on your needs. + +## Example usage + +We base our implementation on the [AllenNLP library](https://github.com/allenai/allennlp). For an introduction to this library, you should check [these tutorials](https://allennlp.org/tutorials). + +An experiment is described in a _json_ configuration file (see `configs/*.json` for examples). The configuration file mainly describes the datasets to load, the model to create along with all the hyper-parameters of the model. + +Once you have set up your configuration file (and defined custom classes if needed), you can simply launch a training with the following command and arguments: + +```bash +python train.py --config_file_path configs/hmtl_coref_conll.json --serialization_dir my_first_training +``` + +Once the training has started, you can simply follow the training in the terminal or open a [Tensorboard](https://www.tensorflow.org/guide/summaries_and_tensorboard): + +```bash +tensorboard --logdir my_first_training/log +``` + +## Evaluating the embeddings with SentEval + +We used [SentEval](https://github.com/facebookresearch/SentEval) to assess the linguistic properties learned by the model. `hmtl_senteval.py` gives an example of how we can create an interface between SentEval and HMTL. + +## Data + +To download the pre-trained embeddings we used in HMTL, you can simply call the script `./script/data_setup.sh`. 
+ +We do not attach the datasets used to train HMTL for licensing reasons, but we invite you to collect them yourself: [OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19), [CoNLL2003](https://www.clips.uantwerpen.be/conll2003/ner/), and [ACE2005](https://catalog.ldc.upenn.edu/LDC2006T06). The configuration files expect the datasets to be placed in the `data/` folder. + +## References + +``` +@article{ +} +``` diff --git a/configs/coref_ace.json b/configs/coref_ace.json new file mode 100644 index 0000000..d3e5326 --- /dev/null +++ b/configs/coref_ace.json @@ -0,0 +1,150 @@ +{ + "task_coref":{ + "task_description":{ + "task_name": "coref", + "validation_metric_name": "coref_f1", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "coref_ace", + "max_span_width": 8, + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path":"./data/ace2005/single_file_train_rahman.gold_conll", + "validation_data_path": "./data/ace2005/single_file_dev_rahman.gold_conll", + "test_data_path": "./data/ace2005/single_file_test_rahman.gold_conll", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "model": { + "type": "coref_custom", + + "text_field_embedder": { + "token_embedders": { + "tokens": { + "type": "embedding", + "pretrained_file": "./data/glove/glove.6B.100d.txt.gz", + "embedding_dim": 100, + "trainable": true + }, + "elmo": { + "type": "elmo_token_embedder", + "options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json", + "weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5", + "do_layer_norm": false, + "dropout": 0, + "requires_grad": false + }, + "token_characters": { + "type": "character_encoding", + "embedding": { + "embedding_dim": 16 + }, + "encoder": { + "type": "cnn", + "embedding_dim": 16, + 
"num_filters": 64, + "ngram_filter_sizes": [3] + }, + "dropout": 0.1 + } + } + }, + + "coref": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1188, + "hidden_size": 200, + "num_layers": 1, + "dropout": 0.2 + }, + "tagger": { + "mention_feedforward": { + "input_dim": 2008, + "num_layers": 2, + "hidden_dims": 150, + "activations": "relu", + "dropout": 0.3 + }, + "antecedent_feedforward": { + "input_dim": 6044, + "num_layers": 2, + "hidden_dims": 150, + "activations": "relu", + "dropout": 0.3 + }, + "initializer": [ + [".*linear_layers.*weight", {"type": "xavier_normal"}], + [".*scorer._module.weight", {"type": "xavier_normal"}], + ["_distance_embedding.weight", {"type": "xavier_normal"}], + ["_span_width_embedding.weight", {"type": "xavier_normal"}], + ["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}], + ["_context_layer._module.weight_hh.*", {"type": "orthogonal"}] + ], + "lexical_dropout": 0.5, + "feature_size": 20, + "max_span_width": 8, + "spans_per_word": 0.4, + "max_antecedents": 70, + "eval_on_gold_mentions": false + } + } + }, + + "iterators": { + "iterator": { + "type": "basic", + "batch_size": 32 + }, + "iterator_coref": { + "type": "bucket", + "sorting_keys": [["text", "num_tokens"]], + "padding_noise": 0.0, + "batch_size": 1 + } + }, + + "multi_task_trainer": { + "type": "sampler_multi_task_trainer", + "sampling_method": "proportional", + "patience": 10, + "num_epochs": 100, + "min_lr": "1e-7", + "grad_norm": 5.0, + "grad_clipping": 10.0, + "cuda_device": 0, + "optimizer": { + "type": "adam", + "lr": 0.001 + }, + "scheduler": { + "type": "reduce_on_plateau", + "mode": "min", + "factor": 0.5, + "patience": 5, + "threshold": 0.0001, + "threshold_mode": "abs", + "verbose": true + } + } +} \ No newline at end of file diff --git a/configs/coref_conll.json b/configs/coref_conll.json new file mode 100644 index 0000000..d538bac --- /dev/null +++ b/configs/coref_conll.json @@ -0,0 +1,150 @@ +{ + "task_coref":{ + 
"task_description":{ + "task_name": "coref", + "validation_metric_name": "coref_f1", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "coref", + "max_span_width": 8, + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path":"./data/conll-2012_single_file/train.english.v4_gold_conll", + "validation_data_path": "./data/conll-2012_single_file/dev.english.v4_gold_conll", + "test_data_path": "./data/conll-2012_single_file/test.english.v4_gold_conll", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "model": { + "type": "coref_custom", + + "text_field_embedder": { + "token_embedders": { + "tokens": { + "type": "embedding", + "pretrained_file": "./data/glove/glove.6B.100d.txt.gz", + "embedding_dim": 100, + "trainable": true + }, + "elmo": { + "type": "elmo_token_embedder", + "options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json", + "weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5", + "do_layer_norm": false, + "dropout": 0, + "requires_grad": false + }, + "token_characters": { + "type": "character_encoding", + "embedding": { + "embedding_dim": 16 + }, + "encoder": { + "type": "cnn", + "embedding_dim": 16, + "num_filters": 64, + "ngram_filter_sizes": [3] + }, + "dropout": 0.1 + } + } + }, + + "coref": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1188, + "hidden_size": 200, + "num_layers": 1, + "dropout": 0.2 + }, + "tagger": { + "mention_feedforward": { + "input_dim": 2008, + "num_layers": 2, + "hidden_dims": 150, + "activations": "relu", + "dropout": 0.3 + }, + "antecedent_feedforward": { + "input_dim": 6044, + "num_layers": 2, + "hidden_dims": 150, + "activations": "relu", + "dropout": 0.3 + }, + "initializer": [ + [".*linear_layers.*weight", {"type": 
"xavier_normal"}], + [".*scorer._module.weight", {"type": "xavier_normal"}], + ["_distance_embedding.weight", {"type": "xavier_normal"}], + ["_span_width_embedding.weight", {"type": "xavier_normal"}], + ["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}], + ["_context_layer._module.weight_hh.*", {"type": "orthogonal"}] + ], + "lexical_dropout": 0.5, + "feature_size": 20, + "max_span_width": 8, + "spans_per_word": 0.4, + "max_antecedents": 70, + "eval_on_gold_mentions": false + } + } + }, + + "iterators": { + "iterator": { + "type": "basic", + "batch_size": 32 + }, + "iterator_coref": { + "type": "bucket", + "sorting_keys": [["text", "num_tokens"]], + "padding_noise": 0.0, + "batch_size": 1 + } + }, + + "multi_task_trainer": { + "type": "sampler_multi_task_trainer", + "sampling_method": "proportional", + "patience": 10, + "num_epochs": 100, + "min_lr": "1e-7", + "grad_norm": 5.0, + "grad_clipping": 10.0, + "cuda_device": 0, + "optimizer": { + "type": "adam", + "lr": 0.001 + }, + "scheduler": { + "type": "reduce_on_plateau", + "mode": "min", + "factor": 0.5, + "patience": 5, + "threshold": 0.0001, + "threshold_mode": "abs", + "verbose": true + } + } +} \ No newline at end of file diff --git a/configs/emd.json b/configs/emd.json new file mode 100644 index 0000000..a921499 --- /dev/null +++ b/configs/emd.json @@ -0,0 +1,120 @@ +{ + "task_ner":{ + "task_description":{ + "task_name": "ner", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader":{ + "type": "mention_ace", + "label_namespace": "ace_mention_labels", + "lazy": false, + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path": "./data/ace2005/train/", + "validation_data_path": "./data/ace2005/dev/", + "test_data_path": 
"./data/ace2005/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "model": { + "type": "ner", + + "text_field_embedder": { + "token_embedders": { + "tokens": { + "type": "embedding", + "pretrained_file": "./data/glove/glove.6B.100d.txt.gz", + "embedding_dim": 100, + "trainable": true + }, + "elmo": { + "type": "elmo_token_embedder", + "options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json", + "weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5", + "do_layer_norm": false, + "dropout": 0, + "requires_grad": false + }, + "token_characters": { + "type": "character_encoding", + "embedding": { + "embedding_dim": 16 + }, + "encoder": { + "type": "cnn", + "embedding_dim": 16, + "num_filters": 64, + "ngram_filter_sizes": [3] + }, + "dropout": 0.1 + } + } + }, + + "ner": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1188, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.2 + }, + "tagger": { + "label_namespace": "ace_mention_labels", + "constraint_type": "BIOUL", + "dropout": 0.2 + } + } + }, + + "iterators": { + "iterator": { + "type": "basic", + "batch_size": 32 + } + }, + + "multi_task_trainer": { + "type": "sampler_multi_task_trainer", + "sampling_method": "proportional", + "patience": 10, + "num_epochs": 100, + "min_lr": "1e-7", + "grad_norm": 5.0, + "grad_clipping": 10.0, + "cuda_device": 0, + "optimizer": { + "type": "adam", + "lr": 0.001 + }, + "scheduler": { + "type": "reduce_on_plateau", + "mode": "min", + "factor": 0.5, + "patience": 5, + "threshold": 0.0001, + "threshold_mode": "abs", + "verbose": true + } + } +} diff --git a/configs/emd_coref_ace.json b/configs/emd_coref_ace.json new file mode 100644 index 0000000..c5b829f --- /dev/null +++ b/configs/emd_coref_ace.json @@ -0,0 +1,200 @@ +{ + "task_emd":{ + "task_description":{ + "task_name": "emd", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + 
"data_params":{ + "dataset_reader": { + "type": "mention_ace", + "label_namespace": "ace_mention_labels", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + "train_data_path": "./data/ace2005/train/", + "validation_data_path": "./data/ace2005/dev/", + "test_data_path": "./data/ace2005/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "task_coref":{ + "task_description":{ + "task_name": "coref", + "validation_metric_name": "coref_f1", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "coref_ace", + "max_span_width": 8, + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path":"./data/ace2005/single_file_train.gold_conll", + "validation_data_path": "./data/ace2005/single_file_dev.gold_conll", + "test_data_path": "./data/ace2005/single_file_test.gold_conll", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "model": { + "type": "emd_coref", + + "text_field_embedder": { + "token_embedders": { + "tokens": { + "type": "embedding", + "pretrained_file": "./data/glove/glove.6B.100d.txt.gz", + "embedding_dim": 100, + "trainable": true + }, + "elmo": { + "type": "elmo_token_embedder", + "options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json", + "weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5", + "do_layer_norm": false, + "dropout": 0, + "requires_grad": false + }, + "token_characters": { + "type": "character_encoding", + "embedding": { + "embedding_dim": 16 + }, + "encoder": { + "type": "cnn", + "embedding_dim": 16, + "num_filters": 64, + "ngram_filter_sizes": [3] + }, + "dropout": 0.1 + } + } + }, + + "emd": { + "encoder": { + "type": 
"lstm", + "bidirectional": true, + "input_size": 1188, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.2 + }, + "tagger": { + "label_namespace": "ace_mention_labels", + "constraint_type": "BIOUL", + "dropout": 0.2 + } + }, + + "coref": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1316, + "hidden_size": 200, + "num_layers": 1, + "dropout": 0.2 + }, + "tagger": { + "mention_feedforward": { + "input_dim": 2136, + "num_layers": 2, + "hidden_dims": 150, + "activations": "relu", + "dropout": 0.3 + }, + "antecedent_feedforward": { + "input_dim": 6428, + "num_layers": 2, + "hidden_dims": 150, + "activations": "relu", + "dropout": 0.3 + }, + "initializer": [ + [".*linear_layers.*weight", {"type": "xavier_normal"}], + [".*scorer._module.weight", {"type": "xavier_normal"}], + ["_distance_embedding.weight", {"type": "xavier_normal"}], + ["_span_width_embedding.weight", {"type": "xavier_normal"}], + ["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}], + ["_context_layer._module.weight_hh.*", {"type": "orthogonal"}] + ], + "lexical_dropout": 0.5, + "feature_size": 20, + "max_span_width": 8, + "spans_per_word": 0.4, + "max_antecedents": 70, + "eval_on_gold_mentions": false + } + } + }, + + "iterators": { + "iterator": { + "type": "basic", + "batch_size": 32 + }, + "iterator_coref": { + "type": "bucket", + "sorting_keys": [["text", "num_tokens"]], + "padding_noise": 0.0, + "batch_size": 1 + } + }, + + "multi_task_trainer": { + "type": "sampler_multi_task_trainer", + "sampling_method": "proportional", + "patience": 10, + "num_epochs": 100, + "min_lr": "1e-7", + "grad_norm": 5.0, + "grad_clipping": 10.0, + "cuda_device": 0, + "optimizer": { + "type": "adam", + "lr": 0.001 + }, + "scheduler": { + "type": "reduce_on_plateau", + "mode": "min", + "factor": 0.5, + "patience": 5, + "threshold": 0.0001, + "threshold_mode": "abs", + "verbose": true + } + } +} \ No newline at end of file diff --git a/configs/emd_relation.json 
b/configs/emd_relation.json new file mode 100644 index 0000000..ed3fa90 --- /dev/null +++ b/configs/emd_relation.json @@ -0,0 +1,173 @@ +{ + "task_emd":{ + "task_description":{ + "task_name": "emd", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "mention_ace", + "label_namespace": "ace_mention_labels", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + "train_data_path": "./data/ace2005/train/", + "validation_data_path": "./data/ace2005/dev/", + "test_data_path": "./data/ace2005/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "task_relation":{ + "task_description":{ + "task_name": "relation", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "relation_ace", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path": "./data/ace2005/train/", + "validation_data_path": "./data/ace2005/dev/", + "test_data_path": "./data/ace2005/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "model": { + "type": "emd_relation", + + "text_field_embedder": { + "token_embedders": { + "tokens": { + "type": "embedding", + "pretrained_file": "./data/glove/glove.6B.100d.txt.gz", + "embedding_dim": 100, + "trainable": true + }, + "elmo": { + "type": "elmo_token_embedder", + "options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json", + "weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5", + "do_layer_norm": false, + "dropout": 0, + "requires_grad": false + }, + "token_characters": 
{ + "type": "character_encoding", + "embedding": { + "embedding_dim": 16 + }, + "encoder": { + "type": "cnn", + "embedding_dim": 16, + "num_filters": 64, + "ngram_filter_sizes": [3] + }, + "dropout": 0.1 + } + } + }, + + "emd": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1188, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.2 + }, + "tagger": { + "label_namespace": "ace_mention_labels", + "constraint_type": "BIOUL", + "dropout": 0.2 + } + }, + + "relation": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1316, + "hidden_size": 64, + "num_layers": 3, + "dropout": 0.2 + }, + "tagger": { + "d": 64, + "l": 64, + "n_classes": 6, + "activation": "relu" + } + } + }, + + "iterators": { + "iterator": { + "type": "basic", + "batch_size": 32 + }, + "iterator_relation": { + "type": "basic", + "batch_size": 4 + } + }, + + "multi_task_trainer": { + "type": "sampler_multi_task_trainer", + "sampling_method": "proportional", + "patience": 10, + "num_epochs": 100, + "min_lr": "1e-7", + "grad_norm": 5.0, + "grad_clipping": 10.0, + "cuda_device": 0, + "optimizer": { + "type": "adam", + "lr": 0.001 + }, + "scheduler": { + "type": "reduce_on_plateau", + "mode": "min", + "factor": 0.5, + "patience": 5, + "threshold": 0.0001, + "threshold_mode": "abs", + "verbose": true + } + } +} \ No newline at end of file diff --git a/configs/hmtl_coref_ace.json b/configs/hmtl_coref_ace.json new file mode 100644 index 0000000..57a6fdf --- /dev/null +++ b/configs/hmtl_coref_ace.json @@ -0,0 +1,307 @@ +{ + "task_ner":{ + "task_description":{ + "task_name": "ner", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader":{ + "type": "ner_ontonotes", + "label_namespace": "ontonotes_ner_labels", + "coding_scheme": "BIOUL", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + 
"type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path": "./data/conll-2012/v4/data/train/", + "validation_data_path": "./data/conll-2012/v4/data/development/", + "test_data_path": "./data/conll-2012/v4/data/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "task_emd":{ + "task_description":{ + "task_name": "emd", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "mention_ace", + "label_namespace": "ace_mention_labels", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + "train_data_path": "./data/ace2005/train/", + "validation_data_path": "./data/ace2005/dev/", + "test_data_path": "./data/ace2005/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "task_relation":{ + "task_description":{ + "task_name": "relation", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "relation_ace", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path": "./data/ace2005/train/", + "validation_data_path": "./data/ace2005/dev/", + "test_data_path": "./data/ace2005/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "task_coref":{ + "task_description":{ + "task_name": "coref", + "validation_metric_name": "coref_f1", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "coref_ace", + "max_span_width": 8, + "token_indexers": { + "tokens": { + "type": "single_id", + 
"lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path":"./data/ace2005/single_file_train_rahman.gold_conll", + "validation_data_path": "./data/ace2005/single_file_dev_rahman.gold_conll", + "test_data_path": "./data/ace2005/single_file_test_rahman.gold_conll", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "model": { + "type": "hmtl", + + "text_field_embedder": { + "token_embedders": { + "tokens": { + "type": "embedding", + "pretrained_file": "./data/glove/glove.6B.100d.txt.gz", + "embedding_dim": 100, + "trainable": true + }, + "elmo": { + "type": "elmo_token_embedder", + "options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json", + "weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5", + "do_layer_norm": false, + "dropout": 0, + "requires_grad": false + }, + "token_characters": { + "type": "character_encoding", + "embedding": { + "embedding_dim": 16 + }, + "encoder": { + "type": "cnn", + "embedding_dim": 16, + "num_filters": 64, + "ngram_filter_sizes": [3] + }, + "dropout": 0.1 + } + } + }, + + "ner": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1188, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.2 + }, + "tagger": { + "label_namespace": "ontonotes_ner_labels", + "constraint_type": "BIOUL", + "dropout": 0.2 + } + }, + + "emd": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1316, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.2 + }, + "tagger": { + "label_namespace": "ace_mention_labels", + "constraint_type": "BIOUL", + "dropout": 0.2 + } + }, + + "relation": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1316, + "hidden_size": 64, + "num_layers": 3, + "dropout": 0.2 + }, + "tagger": { + "d": 64, + "l": 64, + "n_classes": 6, + "activation": "relu" + } + }, + + "coref": { + "encoder": { + "type": "lstm", + 
"bidirectional": true, + "input_size": 1316, + "hidden_size": 200, + "num_layers": 1, + "dropout": 0.2 + }, + "tagger": { + "mention_feedforward": { + "input_dim": 2136, + "num_layers": 2, + "hidden_dims": 150, + "activations": "relu", + "dropout": 0.3 + }, + "antecedent_feedforward": { + "input_dim": 6428, + "num_layers": 2, + "hidden_dims": 150, + "activations": "relu", + "dropout": 0.3 + }, + "initializer": [ + [".*linear_layers.*weight", {"type": "xavier_normal"}], + [".*scorer._module.weight", {"type": "xavier_normal"}], + ["_distance_embedding.weight", {"type": "xavier_normal"}], + ["_span_width_embedding.weight", {"type": "xavier_normal"}], + ["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}], + ["_context_layer._module.weight_hh.*", {"type": "orthogonal"}] + ], + "lexical_dropout": 0.5, + "feature_size": 20, + "max_span_width": 8, + "spans_per_word": 0.4, + "max_antecedents": 70, + "eval_on_gold_mentions": false + } + } + }, + + "iterators": { + "iterator": { + "type": "basic", + "batch_size": 32 + }, + "iterator_relation": { + "type": "basic", + "batch_size": 4 + }, + "iterator_coref": { + "type": "bucket", + "sorting_keys": [["text", "num_tokens"]], + "padding_noise": 0.0, + "batch_size": 1 + } + }, + + "multi_task_trainer": { + "type": "sampler_multi_task_trainer", + "sampling_method": "proportional", + "patience": 10, + "num_epochs": 100, + "min_lr": "1e-7", + "grad_norm": 5.0, + "grad_clipping": 10.0, + "cuda_device": 0, + "optimizer": { + "type": "adam", + "lr": 0.001 + }, + "scheduler": { + "type": "reduce_on_plateau", + "mode": "min", + "factor": 0.5, + "patience": 5, + "threshold": 0.0001, + "threshold_mode": "abs", + "verbose": true + } + } +} \ No newline at end of file diff --git a/configs/hmtl_coref_conll.json b/configs/hmtl_coref_conll.json new file mode 100644 index 0000000..8aa732b --- /dev/null +++ b/configs/hmtl_coref_conll.json @@ -0,0 +1,307 @@ +{ + "task_ner":{ + "task_description":{ + "task_name": "ner", + 
"validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader":{ + "type": "ner_ontonotes", + "label_namespace": "ontonotes_ner_labels", + "coding_scheme": "BIOUL", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path": "./data/conll-2012/v4/data/train/", + "validation_data_path": "./data/conll-2012/v4/data/development/", + "test_data_path": "./data/conll-2012/v4/data/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "task_emd":{ + "task_description":{ + "task_name": "emd", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "mention_ace", + "label_namespace": "ace_mention_labels", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + "train_data_path": "./data/ace2005/train/", + "validation_data_path": "./data/ace2005/dev/", + "test_data_path": "./data/ace2005/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "task_relation":{ + "task_description":{ + "task_name": "relation", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "relation_ace", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path": "./data/ace2005/train/", + "validation_data_path": "./data/ace2005/dev/", + "test_data_path": "./data/ace2005/test/", + + 
"datasets_for_vocab_creation": ["train"] + } + }, + + "task_coref":{ + "task_description":{ + "task_name": "coref", + "validation_metric_name": "coref_f1", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "coref", + "max_span_width": 8, + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path":"./data/conll-2012_single_file/train.english.v4_gold_conll", + "validation_data_path": "./data/conll-2012_single_file/dev.english.v4_gold_conll", + "test_data_path": "./data/conll-2012_single_file/test.english.v4_gold_conll", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "model": { + "type": "hmtl", + + "text_field_embedder": { + "token_embedders": { + "tokens": { + "type": "embedding", + "pretrained_file": "./data/glove/glove.6B.100d.txt.gz", + "embedding_dim": 100, + "trainable": true + }, + "elmo": { + "type": "elmo_token_embedder", + "options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json", + "weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5", + "do_layer_norm": false, + "dropout": 0, + "requires_grad": false + }, + "token_characters": { + "type": "character_encoding", + "embedding": { + "embedding_dim": 16 + }, + "encoder": { + "type": "cnn", + "embedding_dim": 16, + "num_filters": 64, + "ngram_filter_sizes": [3] + }, + "dropout": 0.1 + } + } + }, + + "ner": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1188, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.2 + }, + "tagger": { + "label_namespace": "ontonotes_ner_labels", + "constraint_type": "BIOUL", + "dropout": 0.2 + } + }, + + "emd": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1316, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.2 + }, + "tagger": { + 
"label_namespace": "ace_mention_labels", + "constraint_type": "BIOUL", + "dropout": 0.2 + } + }, + + "relation": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1316, + "hidden_size": 64, + "num_layers": 3, + "dropout": 0.2 + }, + "tagger": { + "d": 64, + "l": 64, + "n_classes": 6, + "activation": "relu" + } + }, + + "coref": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1316, + "hidden_size": 200, + "num_layers": 1, + "dropout": 0.2 + }, + "tagger": { + "mention_feedforward": { + "input_dim": 2136, + "num_layers": 2, + "hidden_dims": 150, + "activations": "relu", + "dropout": 0.3 + }, + "antecedent_feedforward": { + "input_dim": 6428, + "num_layers": 2, + "hidden_dims": 150, + "activations": "relu", + "dropout": 0.3 + }, + "initializer": [ + [".*linear_layers.*weight", {"type": "xavier_normal"}], + [".*scorer._module.weight", {"type": "xavier_normal"}], + ["_distance_embedding.weight", {"type": "xavier_normal"}], + ["_span_width_embedding.weight", {"type": "xavier_normal"}], + ["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}], + ["_context_layer._module.weight_hh.*", {"type": "orthogonal"}] + ], + "lexical_dropout": 0.5, + "feature_size": 20, + "max_span_width": 8, + "spans_per_word": 0.4, + "max_antecedents": 70, + "eval_on_gold_mentions": false + } + } + }, + + "iterators": { + "iterator": { + "type": "basic", + "batch_size": 32 + }, + "iterator_relation": { + "type": "basic", + "batch_size": 4 + }, + "iterator_coref": { + "type": "bucket", + "sorting_keys": [["text", "num_tokens"]], + "padding_noise": 0.0, + "batch_size": 1 + } + }, + + "multi_task_trainer": { + "type": "sampler_multi_task_trainer", + "sampling_method": "proportional", + "patience": 10, + "num_epochs": 100, + "min_lr": "1e-7", + "grad_norm": 5.0, + "grad_clipping": 10.0, + "cuda_device": 0, + "optimizer": { + "type": "adam", + "lr": 0.001 + }, + "scheduler": { + "type": "reduce_on_plateau", + "mode": "min", + "factor": 
0.5, + "patience": 5, + "threshold": 0.0001, + "threshold_mode": "abs", + "verbose": true + } + } +} \ No newline at end of file diff --git a/configs/ner.json b/configs/ner.json new file mode 100644 index 0000000..d1655ea --- /dev/null +++ b/configs/ner.json @@ -0,0 +1,120 @@ +{ + "task_ner":{ + "task_description":{ + "task_name": "ner", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader":{ + "type": "ner_ontonotes", + "label_namespace": "ontonotes_ner_labels", + "coding_scheme": "BIOUL", + "lazy": false, + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path": "./data/conll-2012/v4/data/train/", + "validation_data_path": "./data/conll-2012/v4/data/development/", + "test_data_path": "./data/conll-2012/v4/data/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + "model": { + "type": "ner", + + "text_field_embedder": { + "token_embedders": { + "tokens": { + "type": "embedding", + "pretrained_file": "./data/glove/glove.6B.100d.txt.gz", + "embedding_dim": 100, + "trainable": true + }, + "elmo": { + "type": "elmo_token_embedder", + "options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json", + "weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5", + "do_layer_norm": false, + "dropout": 0, + "requires_grad": false + }, + "token_characters": { + "type": "character_encoding", + "embedding": { + "embedding_dim": 16 + }, + "encoder": { + "type": "cnn", + "embedding_dim": 16, + "num_filters": 64, + "ngram_filter_sizes": [3] + }, + "dropout": 0.1 + } + } + }, + + "ner": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1188, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.2 + }, + "tagger": { + "label_namespace": "ontonotes_ner_labels", + 
"constraint_type": "BIOUL", + "dropout": 0.2 + } + } + }, + + "iterators": { + "iterator": { + "type": "basic", + "batch_size": 32 + } + }, + + "multi_task_trainer": { + "type": "sampler_multi_task_trainer", + "sampling_method": "proportional", + "patience": 10, + "num_epochs": 100, + "min_lr": "1e-7", + "grad_norm": 5.0, + "grad_clipping": 10.0, + "cuda_device": 0, + "optimizer": { + "type": "adam", + "lr": 0.001 + }, + "scheduler": { + "type": "reduce_on_plateau", + "mode": "min", + "factor": 0.5, + "patience": 5, + "threshold": 0.0001, + "threshold_mode": "abs", + "verbose": true + } + } +} diff --git a/configs/ner_emd.json b/configs/ner_emd.json new file mode 100644 index 0000000..b5ee471 --- /dev/null +++ b/configs/ner_emd.json @@ -0,0 +1,172 @@ +{ + "task_ner":{ + "task_description":{ + "task_name": "ner", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader":{ + "type": "ner_ontonotes", + "label_namespace": "ontonotes_ner_labels", + "coding_scheme": "BIOUL", + "lazy": false, + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path": "./data/conll-2012/v4/data/train/", + "validation_data_path": "./data/conll-2012/v4/data/development/", + "test_data_path": "./data/conll-2012/v4/data/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "task_emd":{ + "task_description":{ + "task_name": "emd", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "mention_ace", + "label_namespace": "ace_mention_labels", + "lazy": false, + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": 
{ + "type": "elmo_characters" + } + } + }, + + "train_data_path": "./data/ace2005/train", + "validation_data_path": "./data/ace2005/dev", + "test_data_path": "./data/ace2005/test", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "model": { + "type": "ner_emd", + + "text_field_embedder": { + "token_embedders": { + "tokens": { + "type": "embedding", + "pretrained_file": "./data/glove/glove.6B.100d.txt.gz", + "embedding_dim": 100, + "trainable": true + }, + "elmo": { + "type": "elmo_token_embedder", + "options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json", + "weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5", + "do_layer_norm": false, + "dropout": 0, + "requires_grad": false + }, + "token_characters": { + "type": "character_encoding", + "embedding": { + "embedding_dim": 16 + }, + "encoder": { + "type": "cnn", + "embedding_dim": 16, + "num_filters": 64, + "ngram_filter_sizes": [3] + }, + "dropout": 0.1 + } + } + }, + + "ner": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1188, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.2 + }, + "tagger": { + "label_namespace": "ontonotes_ner_labels", + "constraint_type": "BIOUL", + "dropout": 0.2 + } + }, + + "emd": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1316, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.2 + }, + "tagger": { + "label_namespace": "ace_mention_labels", + "constraint_type": "BIOUL", + "dropout": 0.2 + } + } + }, + + "iterators": { + "iterator": { + "type": "basic", + "batch_size": 32 + } + }, + + "multi_task_trainer": { + "type": "sampler_multi_task_trainer", + "sampling_method": "proportional", + "patience": 10, + "num_epochs": 100, + "min_lr": "1e-7", + "grad_norm": 5.0, + "grad_clipping": 10.0, + "cuda_device": 0, + "optimizer": { + "type": "adam", + "lr": 0.001 + }, + "scheduler": { + "type": "reduce_on_plateau", + "mode": "min", + "factor": 0.5, + "patience": 5, + "threshold": 
0.0001, + "threshold_mode": "abs", + "verbose": true + } + } +} \ No newline at end of file diff --git a/configs/ner_emd_coref_ace.json b/configs/ner_emd_coref_ace.json new file mode 100644 index 0000000..b47df4f --- /dev/null +++ b/configs/ner_emd_coref_ace.json @@ -0,0 +1,252 @@ +{ + "task_ner":{ + "task_description":{ + "task_name": "ner", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader":{ + "type": "ner_ontonotes", + "label_namespace": "ontonotes_ner_labels", + "coding_scheme": "BIOUL", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path": "./data/conll-2012/v4/data/train/", + "validation_data_path": "./data/conll-2012/v4/data/development/", + "test_data_path": "./data/conll-2012/v4/data/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "task_emd":{ + "task_description":{ + "task_name": "emd", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "mention_ace", + "label_namespace": "ace_mention_labels", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + "train_data_path": "./data/ace2005/train/", + "validation_data_path": "./data/ace2005/dev/", + "test_data_path": "./data/ace2005/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "task_coref":{ + "task_description":{ + "task_name": "coref", + "validation_metric_name": "coref_f1", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "coref_ace", + "max_span_width": 8, + 
"token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path":"./data/ace2005/single_file_train.gold_conll", + "validation_data_path": "./data/ace2005/single_file_dev.gold_conll", + "test_data_path": "./data/ace2005/single_file_test.gold_conll", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "model": { + "type": "ner_emd_coref", + + "text_field_embedder": { + "token_embedders": { + "tokens": { + "type": "embedding", + "pretrained_file": "./data/glove/glove.6B.100d.txt.gz", + "embedding_dim": 100, + "trainable": true + }, + "elmo": { + "type": "elmo_token_embedder", + "options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json", + "weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5", + "do_layer_norm": false, + "dropout": 0, + "requires_grad": false + }, + "token_characters": { + "type": "character_encoding", + "embedding": { + "embedding_dim": 16 + }, + "encoder": { + "type": "cnn", + "embedding_dim": 16, + "num_filters": 64, + "ngram_filter_sizes": [3] + }, + "dropout": 0.1 + } + } + }, + + "ner": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1188, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.2 + }, + "tagger": { + "label_namespace": "ontonotes_ner_labels", + "constraint_type": "BIOUL", + "dropout": 0.2 + } + }, + + "emd": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1316, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.2 + }, + "tagger": { + "label_namespace": "ace_mention_labels", + "constraint_type": "BIOUL", + "dropout": 0.2 + } + }, + + "coref": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1316, + "hidden_size": 200, + "num_layers": 1, + "dropout": 0.2 + }, + "tagger": { + "mention_feedforward": { + "input_dim": 2136, + "num_layers": 2, + "hidden_dims": 150, + 
"activations": "relu", + "dropout": 0.3 + }, + "antecedent_feedforward": { + "input_dim": 6428, + "num_layers": 2, + "hidden_dims": 150, + "activations": "relu", + "dropout": 0.3 + }, + "initializer": [ + [".*linear_layers.*weight", {"type": "xavier_normal"}], + [".*scorer._module.weight", {"type": "xavier_normal"}], + ["_distance_embedding.weight", {"type": "xavier_normal"}], + ["_span_width_embedding.weight", {"type": "xavier_normal"}], + ["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}], + ["_context_layer._module.weight_hh.*", {"type": "orthogonal"}] + ], + "lexical_dropout": 0.5, + "feature_size": 20, + "max_span_width": 8, + "spans_per_word": 0.4, + "max_antecedents": 70, + "eval_on_gold_mentions": false + } + } + }, + + "iterators": { + "iterator": { + "type": "basic", + "batch_size": 32 + }, + "iterator_coref": { + "type": "bucket", + "sorting_keys": [["text", "num_tokens"]], + "padding_noise": 0.0, + "batch_size": 1 + } + }, + + "multi_task_trainer": { + "type": "sampler_multi_task_trainer", + "sampling_method": "proportional", + "patience": 10, + "num_epochs": 100, + "min_lr": "1e-7", + "grad_norm": 5.0, + "grad_clipping": 10.0, + "cuda_device": 0, + "optimizer": { + "type": "adam", + "lr": 0.001 + }, + "scheduler": { + "type": "reduce_on_plateau", + "mode": "min", + "factor": 0.5, + "patience": 5, + "threshold": 0.0001, + "threshold_mode": "abs", + "verbose": true + } + } +} \ No newline at end of file diff --git a/configs/ner_emd_relation.json b/configs/ner_emd_relation.json new file mode 100644 index 0000000..08d924c --- /dev/null +++ b/configs/ner_emd_relation.json @@ -0,0 +1,226 @@ +{ + "task_ner":{ + "task_description":{ + "task_name": "ner", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader":{ + "type": "ner_ontonotes", + "label_namespace": "ontonotes_ner_labels", + "coding_scheme": "BIOUL", + "token_indexers": { + "tokens": 
{ + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path": "./data/conll-2012/v4/data/train/", + "validation_data_path": "./data/conll-2012/v4/data/development/", + "test_data_path": "./data/conll-2012/v4/data/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "task_emd":{ + "task_description":{ + "task_name": "emd", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "mention_ace", + "label_namespace": "ace_mention_labels", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + "train_data_path": "./data/ace2005/train/", + "validation_data_path": "./data/ace2005/dev/", + "test_data_path": "./data/ace2005/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "task_relation":{ + "task_description":{ + "task_name": "relation", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader": { + "type": "relation_ace", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path": "./data/ace2005/train/", + "validation_data_path": "./data/ace2005/dev/", + "test_data_path": "./data/ace2005/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "model": { + "type": "ner_emd_relation", + + "text_field_embedder": { + "token_embedders": { + "tokens": { + "type": "embedding", + "pretrained_file": "./data/glove/glove.6B.100d.txt.gz", + "embedding_dim": 100, + "trainable": true + }, + "elmo": { + "type": 
"elmo_token_embedder", + "options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json", + "weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5", + "do_layer_norm": false, + "dropout": 0, + "requires_grad": false + }, + "token_characters": { + "type": "character_encoding", + "embedding": { + "embedding_dim": 16 + }, + "encoder": { + "type": "cnn", + "embedding_dim": 16, + "num_filters": 64, + "ngram_filter_sizes": [3] + }, + "dropout": 0.1 + } + } + }, + + "ner": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1188, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.2 + }, + "tagger": { + "label_namespace": "ontonotes_ner_labels", + "constraint_type": "BIOUL", + "dropout": 0.2 + } + }, + + "emd": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1316, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.2 + }, + "tagger": { + "label_namespace": "ace_mention_labels", + "constraint_type": "BIOUL", + "dropout": 0.2 + } + }, + + "relation": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1316, + "hidden_size": 64, + "num_layers": 3, + "dropout": 0.2 + }, + + "tagger": { + "d": 64, + "l": 64, + "n_classes": 6, + "activation": "relu" + } + } + }, + + "iterators": { + "iterator": { + "type": "basic", + "batch_size": 32 + }, + "iterator_relation": { + "type": "basic", + "batch_size": 4 + } + }, + + "multi_task_trainer": { + "type": "sampler_multi_task_trainer", + "sampling_method": "proportional", + "patience": 10, + "num_epochs": 100, + "min_lr": "1e-7", + "grad_norm": 5.0, + "grad_clipping": 10.0, + "cuda_device": 0, + "optimizer": { + "type": "adam", + "lr": 0.001 + }, + "scheduler": { + "type": "reduce_on_plateau", + "mode": "min", + "factor": 0.5, + "patience": 5, + "threshold": 0.0001, + "threshold_mode": "abs", + "verbose": true + } + } +} \ No newline at end of file diff --git a/configs/relation.json b/configs/relation.json new file mode 100644 index 
0000000..570dbf7 --- /dev/null +++ b/configs/relation.json @@ -0,0 +1,124 @@ +{ + "task_relation":{ + "task_description":{ + "task_name": "relation", + "validation_metric_name": "f1-measure-overall", + "validation_metric_decreases": false, + "evaluate_on_test": true + }, + + "data_params":{ + "dataset_reader":{ + "type": "relation_ace", + "lazy": false, + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + }, + "token_characters":{ + "type": "characters" + }, + "elmo": { + "type": "elmo_characters" + } + } + }, + + + "train_data_path": "./data/ace2005/train/", + "validation_data_path": "./data/ace2005/dev/", + "test_data_path": "./data/ace2005/test/", + + "datasets_for_vocab_creation": ["train"] + } + }, + + "model": { + "type": "relation", + + "text_field_embedder": { + "token_embedders": { + "tokens": { + "type": "embedding", + "pretrained_file": "./data/glove/glove.6B.100d.txt.gz", + "embedding_dim": 100, + "trainable": true + }, + "elmo": { + "type": "elmo_token_embedder", + "options_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_options.json", + "weight_file": "./data/elmo/2x4096_512_2048cnn_2xhighway_weights.hdf5", + "do_layer_norm": false, + "dropout": 0, + "requires_grad": false + }, + "token_characters": { + "type": "character_encoding", + "embedding": { + "embedding_dim": 16 + }, + "encoder": { + "type": "cnn", + "embedding_dim": 16, + "num_filters": 64, + "ngram_filter_sizes": [3] + }, + "dropout": 0.1 + } + } + }, + + "relation": { + "encoder": { + "type": "lstm", + "bidirectional": true, + "input_size": 1188, + "hidden_size": 64, + "num_layers": 3, + "dropout": 0.2 + }, + "tagger": { + "d": 64, + "l": 64, + "n_classes": 6, + "activation": "relu" + } + } + }, + + "iterators": { + "iterator": { + "type": "basic", + "batch_size": 32 + }, + "iterator_relation": { + "type": "basic", + "batch_size": 4 + } + }, + + "multi_task_trainer": { + "type": "sampler_multi_task_trainer", + "sampling_method": "proportional", + 
"patience": 10, + "num_epochs": 100, + "min_lr": "1e-7", + "grad_norm": 5.0, + "grad_clipping": 10.0, + "cuda_device": 0, + "optimizer": { + "type": "adam", + "lr": 0.001 + }, + "scheduler": { + "type": "reduce_on_plateau", + "mode": "min", + "factor": 0.5, + "patience": 5, + "threshold": 0.0001, + "threshold_mode": "abs", + "verbose": true + } + } +} \ No newline at end of file diff --git a/evaluate.py b/evaluate.py new file mode 100644 index 0000000..46e7748 --- /dev/null +++ b/evaluate.py @@ -0,0 +1,203 @@ +# coding: utf-8 + +""" +The ``evaluate.py`` file can be used to +evaluate a trained model against a dataset +and report any metrics calculated by the model. +It requires a configuration file and a directory in +which to write the results. + +.. code-block:: bash + + $ python evaluate.py --help + usage: evaluate.py [-h] -s SERIALIZATION_DIR [-g] + + optional arguments: + -h, --help show this help message and exit + -s SERIALIZATION_DIR, --serialization_dir SERIALIZATION_DIR + Directory in which to save the model and its logs. 
def evaluate(model: Model,
             instances: Iterable[Instance],
             task_name: str,
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    """
    Evaluate a model for a particular task (usually after training).

    The model is put in evaluation mode and run, without gradient tracking,
    over every batch produced by ``data_iterator``. The running mean of the
    batch losses is shown in the progress bar and returned alongside the
    metrics computed by the model.

    Parameters
    ----------
    model : ``allennlp.models.model.Model``, required
        The model to evaluate.
    instances : ``Iterable[Instance]``, required
        The (usually test) dataset on which to evaluate the model.
    task_name : ``str``, required
        The name of the task on which to evaluate the model.
    data_iterator : ``DataIterator``
        Iterator that goes through the dataset.
    cuda_device : ``int``
        Cuda device to use; every batch is moved to this device.

    Returns
    -------
    metrics : ``Dict[str, Any]``
        A dictionary containing the metrics on the evaluated dataset, plus a
        ``"loss"`` entry holding the mean loss per batch (``0.0`` when the
        dataset produced no batch at all).
    """
    check_for_gpu(cuda_device)

    def mean_loss(total_loss: float, batch_count: int) -> float:
        # An empty dataset yields no batch: report 0.0 rather than dividing by zero.
        return float(total_loss / batch_count) if batch_count > 0 else 0.0

    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances,
                                 num_epochs = 1,
                                 shuffle = False)
        logger.info("Iterating over dataset")
        generator_tqdm = tqdm.tqdm(iterator,
                                   total = data_iterator.get_num_batches(instances))

        eval_loss = 0.0
        nb_batches = 0
        for batch in generator_tqdm:
            batch = util.move_to_device(batch, cuda_device)
            nb_batches += 1

            eval_output_dict = model.forward(task_name = task_name, tensor_batch = batch)
            loss = eval_output_dict["loss"]
            eval_loss += loss.item()

            # Running metrics are only used to refresh the progress bar description.
            metrics = model.get_metrics(task_name = task_name)
            metrics["loss"] = mean_loss(eval_loss, nb_batches)

            description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
            generator_tqdm.set_description(description, refresh = False)

        # Final metrics: ask the model for the full report and reset its accumulators.
        metrics = model.get_metrics(task_name = task_name, reset = True, full = True)
        metrics["loss"] = mean_loss(eval_loss, nb_batches)
        return metrics
if __name__ == "__main__":
    ### Evaluate from args ###

    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-s",
                        "--serialization_dir",
                        required = True,
                        help = "Directory in which to save the model and its logs.",
                        type = str)
    parser.add_argument("-g",
                        "--gold_mentions",
                        action = "store_true",
                        required = False,
                        default = False,
                        help = "Whether or not evaluate using gold mentions in coreference")
    args = parser.parse_args()
    # NOTE(review): ``args.gold_mentions`` is parsed but never read in this script;
    # confirm how it is supposed to reach the coreference model.


    # The experiment configuration is the copy saved in the serialization directory.
    params = Params.from_file(params_file = os.path.join(args.serialization_dir, "config.json"))


    ### Instantiate tasks ###
    task_list = []
    task_keys = [key for key in params.keys() if re.search("^task_", key)]

    for key in task_keys:
        logger.info("Creating %s", key)
        task_params = params.pop(key)
        task_description = task_params.pop("task_description")
        task_data_params = task_params.pop("data_params")

        task = Task.from_params(params = task_description)
        task_list.append(task)

        _, _ = task.load_data_from_params(params = task_data_params)


    ### Load Vocabulary from files ###
    vocab = Vocabulary.from_files(os.path.join(args.serialization_dir, "vocabulary"))
    logger.info("Vocabulary loaded")


    ### Load the data iterators ###
    task_list = create_and_set_iterators(params = params, task_list = task_list, vocab = vocab)


    ### Regularization ###
    regularizer = None


    ### Create model ###
    model_params = params.pop("model")
    model = Model.from_params(vocab = vocab, params = model_params, regularizer = regularizer)


    ### Real evaluation ###
    cuda_device = params.pop("multi_task_trainer").pop_int("cuda_device", -1)

    # ``evaluate`` moves every batch to ``cuda_device``: the model has to live on
    # the same device, otherwise the forward pass mixes CPU and GPU tensors.
    if cuda_device >= 0:
        model = model.cuda(cuda_device)
    else:
        model = model.cpu()

    metrics = {task._name: {} for task in task_list}
    for task in task_list:
        if not task._evaluate_on_test:
            continue

        logger.info("Task %s will be evaluated using the best epoch weights.", task._name)
        # Explicit check instead of ``assert``, which is stripped when running with ``-O``.
        if task._test_data is None:
            raise ValueError("Task {} wants to be evaluated on test dataset but there is no test data loaded.".format(task._name))

        logger.info("Loading the best epoch weights for task %s", task._name)
        best_model_state_path = os.path.join(args.serialization_dir, "best_{}.th".format(task._name))
        # Deserialize on CPU so that weights saved from a GPU can be read on any machine;
        # ``load_state_dict`` then copies them onto the device the model lives on.
        best_model_state = torch.load(best_model_state_path, map_location = "cpu")
        best_model = model
        best_model.load_state_dict(state_dict = best_model_state)

        test_metric_dict = {}

        # The best weights of ``task`` are evaluated on every task flagged for test evaluation.
        for pair_task in task_list:
            if not pair_task._evaluate_on_test:
                continue

            logger.info("Pair task %s is evaluated with the best model for %s", pair_task._name, task._name)
            test_metric_dict[pair_task._name] = {}
            test_metrics = evaluate(model = best_model,
                                    task_name = pair_task._name,
                                    instances = pair_task._test_data,
                                    data_iterator = pair_task._data_iterator,
                                    cuda_device = cuda_device)

            for metric_name, value in test_metrics.items():
                test_metric_dict[pair_task._name][metric_name] = value

        metrics[task._name]["test"] = deepcopy(test_metric_dict)
        logger.info("Finished evaluation of task %s.", task._name)

    metrics_json = json.dumps(metrics, indent = 2)
    with open(os.path.join(args.serialization_dir, "evaluate_metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)

    logger.info("Metrics: %s", metrics_json)
if __name__ == "__main__":
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-s",
                        "--serialization_dir",
                        required = True,
                        help = "Directory in which to save the model and its logs.",
                        type = str)
    parser.add_argument("-c",
                        "--config_file_path",
                        required = True,
                        help = "Path to parameter file describing the new multi-tasked model to be fine-tuned.",
                        type = str)
    parser.add_argument("-p",
                        "--pretrained_dir",
                        required = True,
                        help = "Directory in which was saved the pre-trained model.",
                        type = str)
    parser.add_argument("-m",
                        "--pretrained_model_name",
                        required = True,
                        help = "Name of the weight file for the pretrained model to fine-tune in the ``pretrained_dir``.",
                        type = str)
    args = parser.parse_args()


    params = Params.from_file(params_file = args.config_file_path)
    serialization_dir = args.serialization_dir
    create_serialization_dir(params, serialization_dir, False)

    # Keep a copy of the configuration next to the model so the run can be reproduced.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "config.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent = 4)


    ### Instantiate tasks ###
    task_list = []
    task_keys = [key for key in params.keys() if re.search("^task_", key)]

    for key in task_keys:
        logger.info("Creating %s", key)
        task_params = params.pop(key)
        task_description = task_params.pop("task_description")
        task_data_params = task_params.pop("data_params")

        task = Task.from_params(params = task_description)
        task_list.append(task)

        _, _ = task.load_data_from_params(params = task_data_params)


    ### Load Vocabulary from files and save it to the new serialization_dir ###
    # PLEASE NOTE that here, we suppose that the vocabulary is the same for the pre-trained model
    # and the model to fine-tune. The most noticeable implication of this hypothesis is that the label specs
    # between the two datasets (for pre-training and for fine-tuning) are exactly the same.
    vocab = Vocabulary.from_files(os.path.join(args.pretrained_dir, "vocabulary"))
    logger.info("Vocabulary loaded from %s", os.path.join(args.pretrained_dir, "vocabulary"))

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))
    logger.info("Save vocabulary to file %s", os.path.join(serialization_dir, "vocabulary"))


    ### Load the data iterators for each task ###
    task_list = create_and_set_iterators(params = params, task_list = task_list, vocab = vocab)


    ### Load Regularizations ###
    regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))


    ### Create model ###
    model_params = params.pop("model")
    model = Model.from_params(vocab = vocab, params = model_params, regularizer = regularizer)


    logger.info("Loading the pretrained model from %s", os.path.join(args.pretrained_dir, args.pretrained_model_name))
    pretrained_model_state_path = os.path.join(args.pretrained_dir, args.pretrained_model_name)
    # Reading the file is kept outside the ``try``: a missing or unreadable weight file
    # must surface as such, not as a configuration mismatch. Weights are deserialized on
    # CPU so that a model pre-trained on GPU can be loaded on any machine.
    pretrained_model_state = torch.load(pretrained_model_state_path, map_location = "cpu")
    try:
        model.load_state_dict(state_dict = pretrained_model_state)
    except (RuntimeError, KeyError) as error:
        # Missing/unexpected keys or shape mismatches: the two architectures differ.
        # The original error is chained so the offending parameters stay visible.
        raise ConfigurationError("It appears that the configuration of the pretrained model and "
                                 "the model to fine-tune are not compatible. "
                                 "Please check the compatibility of the encoders and taggers in the "
                                 "config files.") from error


    ### Create multi-task trainer ###
    multi_task_trainer_params = params.pop("multi_task_trainer")
    trainer = MultiTaskTrainer.from_params(model = model,
                                           task_list = task_list,
                                           serialization_dir = serialization_dir,
                                           params = multi_task_trainer_params)


    ### Launch training ###
    metrics = train_model(multi_task_trainer = trainer,
                          recover = False)
    if metrics is not None:
        logger.info("Fine-tuning is finished ! Let's have a drink. It's on the house !")
If not precised, + we use a shared/common data iterator. + + Parameters + ---------- + params: ``Params`` + A parameter object specifing an experiment. + task_list: ``List[Task]`` + A list containing the tasks of the model to train. + + Returns + ------- + task_list: ``List[Task]`` + The list containing the tasks of the model to train, where each task has a new attribute: the data iterator. + ''' + ### Charge default iterator ### + iterators_params = params.pop("iterators") + + default_iterator_params = iterators_params.pop("iterator") + default_iterator = DataIterator.from_params(default_iterator_params) + default_iterator.index_with(vocab) + + ### Charge dataset specific iterators ### + for task in task_list: + specific_iterator_params = iterators_params.pop("iterator_" + task._name, None) + if specific_iterator_params is not None: + specific_iterator = DataIterator.from_params(specific_iterator_params) + specific_iterator.index_with(vocab) + task.set_data_iterator(specific_iterator) + else: + task.set_data_iterator(default_iterator) + + return task_list \ No newline at end of file diff --git a/hmtl/dataset_readers/__init__.py b/hmtl/dataset_readers/__init__.py new file mode 100644 index 0000000..fbf8504 --- /dev/null +++ b/hmtl/dataset_readers/__init__.py @@ -0,0 +1,6 @@ +# coding: utf-8 + +from hmtl.dataset_readers.ner_ontonotes import NerOntonotesReader +from hmtl.dataset_readers.mention_ace import MentionACEReader +from hmtl.dataset_readers.relation_ace import RelationACEReader +from hmtl.dataset_readers.coref_ace import CorefACEReader \ No newline at end of file diff --git a/hmtl/dataset_readers/coref_ace.py b/hmtl/dataset_readers/coref_ace.py new file mode 100644 index 0000000..054783f --- /dev/null +++ b/hmtl/dataset_readers/coref_ace.py @@ -0,0 +1,180 @@ +# coding: utf-8 + +import logging +import collections +from typing import Any, Dict, List, Optional, Tuple, DefaultDict, Set + +from overrides import overrides + +from allennlp.common import Params +from 
def canonicalize_clusters(clusters: DefaultDict[int, List[Tuple[int, int]]]) -> List[List[Tuple[int, int]]]:
    """
    Merge coreference clusters that share at least one identical span.

    The CoNLL 2012 data includes annotated spans which are identical but
    carry different cluster ids. Clusters are processed in the iteration order
    of ``clusters``; every cluster is merged with *all* the previously
    processed clusters it shares a span with, so that a span never ends up in
    two different output clusters (including when one cluster bridges two
    clusters that were distinct until then).

    Parameters
    ----------
    clusters : ``DefaultDict[int, List[Tuple[int, int]]]``
        A mapping from a cluster id to the spans of this cluster. Only the
        values are read; the mapping is not modified.

    Returns
    -------
    A list of clusters, each one being a list of unique spans. The order of the
    spans inside a cluster is unspecified (they are stored in a set).
    """
    merged_clusters: List[Set[Tuple[int, int]]] = []
    for cluster in clusters.values():
        mentions = set(cluster)
        # Every already processed cluster sharing at least one span with the current one.
        overlapping = [merged for merged in merged_clusters if not merged.isdisjoint(mentions)]

        if not overlapping:
            merged_clusters.append(mentions)
            continue

        # Merge everything into the earliest overlapping cluster.
        target = overlapping[0]
        target.update(mentions)
        if len(overlapping) > 1:
            # The current cluster bridges several clusters: fold them into
            # the target and drop them (compared by identity, not by content).
            absorbed_ids = {id(absorbed) for absorbed in overlapping[1:]}
            for absorbed in overlapping[1:]:
                target.update(absorbed)
            merged_clusters = [merged for merged in merged_clusters if id(merged) not in absorbed_ids]

    return [list(merged) for merged in merged_clusters]
+ span_id, (start, end) = typed_span + clusters[span_id].append((start + total_tokens, + end + total_tokens)) + total_tokens += len(sentence.words) + + canonical_clusters = canonicalize_clusters(clusters) + yield self.text_to_instance([s.words for s in sentences], canonical_clusters) + + @overrides + def text_to_instance(self, # type: ignore + sentences: List[List[str]], + gold_clusters: Optional[List[List[Tuple[int, int]]]] = None) -> Instance: + # pylint: disable=arguments-differ + """ + Parameters + ---------- + sentences : ``List[List[str]]``, required. + A list of lists representing the tokenised words and sentences in the document. + gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None) + A list of all clusters in the document, represented as word spans. Each cluster + contains some number of spans, which can be nested and overlap, but will never + exactly match between clusters. + + Returns + ------- + An ``Instance`` containing the following ``Fields``: + text : ``TextField`` + The text of the full document. + spans : ``ListField[SpanField]`` + A ListField containing the spans represented as ``SpanFields`` + with respect to the document text. + span_labels : ``SequenceLabelField``, optional + The id of the cluster which each possible span belongs to, or -1 if it does + not belong to a cluster. As these labels have variable length (it depends on + how many spans we are considering), we represent this a as a ``SequenceLabelField`` + with respect to the ``spans ``ListField``. 
+ """ + flattened_sentences = [self._normalize_word(word) + for sentence in sentences + for word in sentence] + + metadata: Dict[str, Any] = {"original_text": flattened_sentences} + if gold_clusters is not None: + metadata["clusters"] = gold_clusters + + text_field = TextField([Token(word) for word in flattened_sentences], self._token_indexers) + + cluster_dict = {} + if gold_clusters is not None: + for cluster_id, cluster in enumerate(gold_clusters): + for mention in cluster: + cluster_dict[tuple(mention)] = cluster_id + + spans: List[Field] = [] + span_labels: Optional[List[int]] = [] if gold_clusters is not None else None + + sentence_offset = 0 + for sentence in sentences: + for start, end in enumerate_spans(sentence, + offset=sentence_offset, + max_span_width=self._max_span_width): + if span_labels is not None: + if (start, end) in cluster_dict: + span_labels.append(cluster_dict[(start, end)]) + else: + span_labels.append(-1) + + spans.append(SpanField(start, end, text_field)) + sentence_offset += len(sentence) + + span_field = ListField(spans) + metadata_field = MetadataField(metadata) + + fields: Dict[str, Field] = {"text": text_field, + "spans": span_field, + "metadata": metadata_field} + if span_labels is not None: + fields["span_labels"] = SequenceLabelField(span_labels, span_field) + + return Instance(fields) + + @staticmethod + def _normalize_word(word): + if word == "/." 
class ACESentence:
    """
    Plain container for the annotations of one ACE CONLL-formatted sentence.

    The constructor stores its arguments unchanged, under attributes of the same name.

    Parameters
    ----------
    words : ``List[str]``
        The tokens of the sentence.
    mention_tags : ``List[str]``
        The tags for Entity Mention Detection, one per token.
    relations : ``List[Tuple[str, List[str]]]``
        The relation tags for Relation Extraction in the sentence.
    last_head_token_relations : ``List[Tuple[str, List[str]]]``
        The relation tags restricted to the last tokens of ARG1 and ARG2.
    coref_spans : ``Set[TypedSpan]``
        The spans of the entity mentions involved in coreference resolution.
        Each element is a tuple (cluster_id, (start_index, end_index)), the
        indices being `inclusive`.
    """
    def __init__(self,
                 words: List[str],
                 mention_tags: List[str],
                 relations: List[Tuple[str, List[str]]],
                 last_head_token_relations: List[Tuple[str, List[str]]],
                 coref_spans: Set[TypedSpan]):
        # Tokens.
        self.words = words
        # Entity mention annotations.
        self.mention_tags = mention_tags
        # Relation annotations (full spans, then last head tokens only).
        self.relations = relations
        self.last_head_token_relations = last_head_token_relations
        # Coreference annotations.
        self.coref_spans = coref_spans
+ conll_rows.append(line) + else: + if conll_rows: + document.append(self._conll_rows_to_sentence(conll_rows)) + conll_rows = [] + if line.startswith("#end document"): + yield document + document = [] + if document: + # Collect any stragglers or files which might not + # have the '#end document' format for the end of the file. + yield document + + def sentence_iterator(self, file_path: str) -> Iterator[ACESentence]: + """ + An iterator over the sentences in an individual CONLL formatted file. + """ + for document in self.dataset_document_iterator(file_path): + for sentence in document: + yield sentence + + def _conll_rows_to_sentence(self, conll_rows: List[str]) -> ACESentence: + sentence: List[str] = [] + mention_tags: List[str] = [] + + span_labels: List[List[str]] = [] + current_span_labels: List[str] = [] + + # Cluster id -> List of (start_index, end_index) spans. + clusters: DefaultDict[int, List[Tuple[int, int]]] = defaultdict(list) + # Cluster id -> List of start_indices which are open for this id. + coref_stacks: DefaultDict[int, List[int]] = defaultdict(list) + + for index, row in enumerate(conll_rows): + conll_components = row.split() + + word = conll_components[1] + + if not span_labels: + span_labels = [[] for _ in conll_components[2:-1]] + current_span_labels = [None for _ in conll_components[2:-1]] + self._process_span_annotations_for_word(annotations = conll_components[2:-1], + span_labels = span_labels, + current_span_labels = current_span_labels) + + #Process coref + self._process_coref_span_annotations_for_word(conll_components[-1], + index, + clusters, + coref_stacks) + + sentence.append(word) + + + mention_tags = iob1_to_bioul(span_labels[0]) + + #Process coref clusters + coref_span_tuples: Set[TypedSpan] = {(cluster_id, span) + for cluster_id, span_list in clusters.items() + for span in span_list} + + + #Reformat the labels to only keep the the last token of the head + #Cf paper, we model relation between last tokens of heads. 
+ last_head_token_relations = [] + bioul_relations = [] + + for relation_frame in span_labels[1:]: + bioul_relation_frame = iob1_to_bioul(relation_frame) + + reformatted_frame = [] + for annotation in bioul_relation_frame: + if annotation[:2] in ["L-", "U-"]: + reformatted_frame.append(annotation[2:]) + else: + reformatted_frame.append("*") + + last_head_token_relations.append(reformatted_frame) + bioul_relations.append(bioul_relation_frame) + + return ACESentence(sentence, mention_tags, bioul_relations, last_head_token_relations, coref_span_tuples) + + + @staticmethod + def _process_mention_tags(annotations: List[str]): + """ + Read and pre-process the entity mention tags as a formatted in CoNll-NER-style. + """ + labels = [] + current_span_label = None + for annotation in annotations: + label = annotation.strip("()*") + if "(" in annotation: + bio_label = "B-" + label + current_span_label = label + elif current_span_label is not None: + bio_label = "I-" + current_span_label + else: + bio_label = "O" + if ")" in annotation: + current_span_label = None + labels.append(bio_label) + return labels + + @staticmethod + def _process_span_annotations_for_word(annotations: List[str], + span_labels: List[List[str]], + current_span_labels: List[Optional[str]]) -> None: + """ + Given a sequence of different label types for a single word and the current + span label we are inside, compute the BIO tag for each label and append to a list. + + Parameters + ---------- + annotations: ``List[str]`` + A list of labels to compute BIO tags for. + span_labels : ``List[List[str]]`` + A list of lists, one for each annotation, to incrementally collect + the BIO tags for a sequence. + current_span_labels : ``List[Optional[str]]`` + The currently open span per annotation type, or ``None`` if there is no open span. + """ + for annotation_index, annotation in enumerate(annotations): + # strip all bracketing information to + # get the actual propbank label. 
+ label = annotation.strip("()*") + + if "(" in annotation: + # Entering into a span for a particular semantic role label. + # We append the label and set the current span for this annotation. + bio_label = "B-" + label + span_labels[annotation_index].append(bio_label) + current_span_labels[annotation_index] = label + elif current_span_labels[annotation_index] is not None: + # If there's no '(' token, but the current_span_label is not None, + # then we are inside a span. + bio_label = "I-" + current_span_labels[annotation_index] + span_labels[annotation_index].append(bio_label) + else: + # We're outside a span. + span_labels[annotation_index].append("O") + # Exiting a span, so we reset the current span label for this annotation. + if ")" in annotation: + current_span_labels[annotation_index] = None + + + @staticmethod + def _process_coref_span_annotations_for_word(label: str, + word_index: int, + clusters: DefaultDict[int, List[Tuple[int, int]]], + coref_stacks: DefaultDict[int, List[int]]) -> None: + """ + For a given coref label, add it to a currently open span(s), complete a span(s) or + ignore it, if it is outside of all spans. This method mutates the clusters and coref_stacks + dictionaries. + + Parameters + ---------- + label : ``str`` + The coref label for this word. + word_index : ``int`` + The word index into the sentence. + clusters : ``DefaultDict[int, List[Tuple[int, int]]]`` + A dictionary mapping cluster ids to lists of inclusive spans into the + sentence. + coref_stacks: ``DefaultDict[int, List[int]]`` + Stacks for each cluster id to hold the start indices of active spans (spans + which we are inside of when processing a given word). Spans with the same id + can be nested, which is why we collect these opening spans on a stack, e.g: + + [Greg, the baker who referred to [himself]_ID1 as 'the bread man']_ID1 + """ + if label != "-": + for segment in label.split("|"): + # The conll representation of coref spans allows spans to + # overlap. 
If spans end or begin at the same word, they are + # separated by a "|". + if segment[0] == "(": + # The span begins at this word. + if segment[-1] == ")": + # The span begins and ends at this word (single word span). + cluster_id = int(segment[1:-1]) + clusters[cluster_id].append((word_index, word_index)) + else: + # The span is starting, so we record the index of the word. + cluster_id = int(segment[1:]) + coref_stacks[cluster_id].append(word_index) + else: + # The span for this id is ending, but didn't start at this word. + # Retrieve the start index from the document state and + # add the span to the clusters for this id. + cluster_id = int(segment[:-1]) + start = coref_stacks[cluster_id].pop() + clusters[cluster_id].append((start, word_index)) \ No newline at end of file diff --git a/hmtl/dataset_readers/mention_ace.py b/hmtl/dataset_readers/mention_ace.py new file mode 100644 index 0000000..048f060 --- /dev/null +++ b/hmtl/dataset_readers/mention_ace.py @@ -0,0 +1,75 @@ +# coding: utf-8 + +import logging +from typing import Dict, List, Iterable, Iterator + +from overrides import overrides +import codecs + +from allennlp.common import Params +from allennlp.common.checks import ConfigurationError +from allennlp.common.file_utils import cached_path +from allennlp.data.dataset_readers.dataset_reader import DatasetReader +from allennlp.data.dataset_readers.dataset_utils import iob1_to_bioul +from allennlp.data.fields import Field, TextField, SequenceLabelField +from allennlp.data.instance import Instance +from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer +from allennlp.data.tokenizers import Token +from allennlp.data.dataset_readers.dataset_utils import Ontonotes, OntonotesSentence + +from hmtl.dataset_readers.dataset_utils import ACE, ACESentence + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + + + +@DatasetReader.register("mention_ace") +class MentionACEReader(DatasetReader): + ''' + A dataset reader to read the 
Entity Mention Tags from an ACE dataset + previously pre-procesed to fit the CoNll-NER format. + ''' + def __init__(self, + token_indexers: Dict[str, TokenIndexer] = None, + label_namespace: str = "ace_mention_labels", + lazy: bool = False) -> None: + super().__init__(lazy) + self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} + self._label_namespace = label_namespace + + + @staticmethod + def _sentence_iterate(ace_reader: ACE, + file_path: str) -> Iterable[ACESentence]: + for conll_file in ace_reader.dataset_path_iterator(file_path): + yield from ace_reader.sentence_iterator(conll_file) + + + @overrides + def _read(self, + file_path: str): + file_path = cached_path(file_path) # if `file_path` is a URL, redirect to the cache + ace_reader = ACE() + logger.info("Reading ACE Mention instances from dataset files at: %s", file_path) + + for sentence in self._sentence_iterate(ace_reader, file_path): + tokens = [Token(t) for t in sentence.words] + if not sentence.mention_tags: + tags = ["O" for _ in tokens] + else: + tags = sentence.mention_tags + + yield self.text_to_instance(tokens, tags) + + + def text_to_instance(self, + tokens: List[Token], + tags: List[str] = None) -> Instance: + # pylint: disable=arguments-differ + fields: Dict[str, Field] = {} + text_field = TextField(tokens, token_indexers=self._token_indexers) + fields['tokens'] = text_field + if tags: + fields['tags'] = SequenceLabelField(labels = tags, sequence_field = text_field, label_namespace = self._label_namespace) + return Instance(fields) + \ No newline at end of file diff --git a/hmtl/dataset_readers/ner_ontonotes.py b/hmtl/dataset_readers/ner_ontonotes.py new file mode 100644 index 0000000..f8d0c17 --- /dev/null +++ b/hmtl/dataset_readers/ner_ontonotes.py @@ -0,0 +1,107 @@ +# coding: utf-8 + +import logging +from typing import Dict, List, Iterable + +from overrides import overrides + +from allennlp.common import Params +from allennlp.common.checks import ConfigurationError 
@DatasetReader.register("ner_ontonotes")
class NerOntonotesReader(DatasetReader):
    '''
    An ``allennlp.data.dataset_readers.dataset_reader.DatasetReader`` for reading
    NER annotations in CoNll-formatted OntoNotes dataset.

    NB: This DatasetReader was implemented before the current implementation of
    ``OntonotesNamedEntityRecognition`` in AllenNLP. It is believed to do pretty much the same thing.

    Parameters
    ----------
    token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
        We use this to define the input representation for the text. See :class:`TokenIndexer`.
        Map a token to an id.
    domain_identifier : ``str``, optional (default = None)
        The subdomain to load. If None is specified, the whole dataset is loaded.
        Otherwise only the files whose path contains ``/<domain_identifier>/`` are read.
    label_namespace : ``str``, optional (default = "ontonotes_ner_labels")
        The tag/label namespace for the task/dataset considered.
    lazy : ``bool``, optional (default = False)
        Whether or not the dataset should be loaded in lazy way.
        Refer to https://github.com/allenai/allennlp/blob/master/tutorials/getting_started/laziness.md
        for more details about laziness.
    coding_scheme: ``str``, optional (default=``IOB1``)
        Specifies the coding scheme of the tags.
        Valid options are ``IOB1`` and ``BIOUL``. The ``IOB1`` default maintains
        the original IOB1 scheme in the CoNLL data.
        In the IOB1 scheme, I is a token inside a span, O is a token outside
        a span and B is the beginning of span immediately following another
        span of the same type.

    Raises
    ------
    ConfigurationError
        If ``coding_scheme`` is neither ``IOB1`` nor ``BIOUL``.
    '''
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 domain_identifier: str = None,
                 label_namespace: str = "ontonotes_ner_labels",
                 lazy: bool = False,
                 coding_scheme: str = "IOB1") -> None:
        super().__init__(lazy)
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._domain_identifier = domain_identifier
        self._label_namespace = label_namespace
        self._coding_scheme = coding_scheme
        if coding_scheme not in ("IOB1", "BIOUL"):
            raise ConfigurationError("unknown coding_scheme: {}".format(coding_scheme))

    @overrides
    def _read(self,
              file_path: str):
        """
        Yield one instance per sentence of the (possibly domain-filtered) dataset.

        A sentence without named entity annotations is labelled with "O" for each
        of its tokens. Tags are converted to BIOUL when this coding scheme was requested.
        """
        file_path = cached_path(file_path) # if `file_path` is a URL, redirect to the cache
        ontonotes_reader = Ontonotes()
        logger.info("Reading NER instances from dataset files at: %s", file_path)
        if self._domain_identifier is not None:
            logger.info("Filtering to only include file paths containing the %s domain", self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.named_entities:
                tags = ["O" for _ in tokens]
            else:
                tags = sentence.named_entities

            if self._coding_scheme == "BIOUL":
                tags = iob1_to_bioul(tags)

            yield self.text_to_instance(tokens, tags)


    @staticmethod
    def _ontonotes_subset(ontonotes_reader: Ontonotes,
                          file_path: str,
                          domain_identifier: str) -> Iterable[OntonotesSentence]:
        """
        Iterate over the sentences of the dataset, restricted to one domain if requested.

        When ``domain_identifier`` is not None, a file is read only if its path
        contains ``/<domain_identifier>/``; this is the filtering announced in
        the log message of ``_read``.
        """
        for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
            if domain_identifier is None or "/{}/".format(domain_identifier) in conll_file:
                yield from ontonotes_reader.sentence_iterator(conll_file)


    def text_to_instance(self,
                         tokens: List[Token],
                         tags: List[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Build an instance holding the tokens (``"tokens"``) and, when a non-empty
        list of tags is given, the tags (``"tags"``) stored in the label namespace of the reader.
        """
        fields: Dict[str, Field] = {}
        text_field = TextField(tokens, token_indexers=self._token_indexers)
        fields['tokens'] = text_field
        if tags:
            fields['tags'] = SequenceLabelField(labels = tags, sequence_field = text_field, label_namespace = self._label_namespace)
        return Instance(fields)
+ """ + def __init__(self, + token_indexers: Dict[str, TokenIndexer] = None, + label_namespace: str = "relation_ace_labels", + lazy: bool = False) -> None: + super().__init__(lazy) + self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} + self._label_namespace = label_namespace + + + @staticmethod + def _sentence_iterate(ace_reader: ACE, + file_path: str) -> Iterable[ACESentence]: + for conll_file in ace_reader.dataset_path_iterator(file_path): + yield from ace_reader.sentence_iterator(conll_file) + + + @overrides + def _read(self, + file_path: str): + file_path = cached_path(file_path) # if `file_path` is a URL, redirect to the cache + ace_reader = ACE() + logger.info("Reading Relation labels from dataset files at: %s", file_path) + + for sentence in self._sentence_iterate(ace_reader, file_path): + tokens = [Token(t) for t in sentence.words] + + if sentence.relations == []: + relations = None + continue + else: + relations = sentence.last_head_token_relations + yield self.text_to_instance(tokens, relations) + + + def text_to_instance(self, + tokens: List[Token], + relations = None) -> Instance: + # pylint: disable=arguments-differ + fields: Dict[str, Field] = {} + text_field = TextField(tokens, token_indexers=self._token_indexers) + fields['text'] = text_field + if relations is not None: + field_list = [] + for relation in relations: + field_list.append(SequenceLabelField(labels = relation, sequence_field = text_field, label_namespace = self._label_namespace)) + fields["relations"] = ListField(field_list = field_list) + return Instance(fields) + \ No newline at end of file diff --git a/hmtl/models/__init__.py b/hmtl/models/__init__.py new file mode 100644 index 0000000..b13b730 --- /dev/null +++ b/hmtl/models/__init__.py @@ -0,0 +1,21 @@ +# coding: utf-8 + +from hmtl.models.coref_custom import CoreferenceCustom +from hmtl.models.relation_extraction import RelationExtractor + +#Single Module +from hmtl.models.layerNer import LayerNer +from 
hmtl.models.layerRelation import LayerRelation +from hmtl.models.layerCoref import LayerCoref + +#Two modules +from hmtl.models.layerNerEmd import LayerNerEmd +from hmtl.models.layerEmdRelation import LayerEmdRelation +from hmtl.models.layerEmdCoref import LayerEmdCoref + +#Three modules +from hmtl.models.layerNerEmdCoref import LayerNerEmdCoref +from hmtl.models.layerNerEmdRelation import LayerNerEmdRelation + +#Four modules +from hmtl.models.hmtl import HMTL \ No newline at end of file diff --git a/hmtl/models/coref_custom.py b/hmtl/models/coref_custom.py new file mode 100644 index 0000000..45e07e3 --- /dev/null +++ b/hmtl/models/coref_custom.py @@ -0,0 +1,204 @@ +import logging +import math +from typing import Any, Dict, List, Optional, Tuple + +import torch +import torch.nn.functional as F +from overrides import overrides + +from allennlp.data import Vocabulary +from allennlp.models.model import Model +from allennlp.modules.token_embedders import Embedding +from allennlp.modules import FeedForward +from allennlp.modules import Seq2SeqEncoder, TimeDistributed, TextFieldEmbedder, SpanPruner +from allennlp.modules.span_extractors import SelfAttentiveSpanExtractor, EndpointSpanExtractor +from allennlp.nn import util, InitializerApplicator, RegularizerApplicator +from allennlp.training.metrics import MentionRecall, ConllCorefScores +from allennlp.models.coreference_resolution import CoreferenceResolver + +from hmtl.training.metrics import ConllCorefFullScores + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + + +class CoreferenceCustom(CoreferenceResolver): + """ + This class implements a marginally modified version of ``allennlp.models.coreference_resolution.CoreferenceResolver`` + which is an implementation of the model of Lee et al., 2017. + The two modifications are: + 1/ Replacing the scorer to be able to get the 3 detailled coreference metrics (B3, MUC, CEAFE), + and not only their average. 
@overrides
def forward(self,  # type: ignore
            text: Dict[str, torch.LongTensor],
            spans: torch.IntTensor,
            span_labels: torch.IntTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Score candidate spans and predict an antecedent for each kept span.

    When ``self._use_gold_mentions`` is truthy, only the candidate spans that
    exactly match a gold mention (read from ``metadata[0]["clusters"]``) are left
    unmasked before pruning; otherwise every span with a non-negative start is kept.

    Parameters
    ----------
    text: the token tensors passed to the text field embedder.
    spans: candidate ``(start, end)`` indices; the last dimension has size 2.
    span_labels: optional gold labels for the spans. When given, the loss is
        computed and the mention-recall / CoNLL metrics are updated.
    metadata: optional list of per-document dicts. ``"clusters"`` is read in
        gold-mention mode and ``"original_text"`` is copied to the output.

    Returns
    -------
    A dict with ``"top_spans"``, ``"antecedent_indices"`` and
    ``"predicted_antecedents"``, plus ``"loss"`` when ``span_labels`` is given and
    ``"document"`` when ``metadata`` is given.
    """

    # Shape: (batch_size, document_length, embedding_size)
    text_embeddings = self._lexical_dropout(self._text_field_embedder(text))

    document_length = text_embeddings.size(1)

    # Shape: (batch_size, document_length)
    text_mask = util.get_text_field_mask(text).float()

    # Shape: (batch_size, num_spans)
    if self._use_gold_mentions:
        # Build the gold tensor on the very device the inputs live on.
        # ``torch.device('cuda')`` would silently pick the *current* GPU,
        # which is not necessarily the one holding ``text_embeddings``.
        device = text_embeddings.device

        # NOTE(review): only the first document's clusters are read, so this
        # branch presumably assumes a batch size of 1 - confirm with the caller.
        gold_spans = [torch.as_tensor(pair, dtype = torch.long, device = device)
                      for cluster in metadata[0]["clusters"]
                      for pair in cluster]
        gm = torch.stack(gold_spans, dim = 0).unsqueeze(0).unsqueeze(1)

        span_mask = (spans.unsqueeze(2) - gm)
        # Count matching endpoints as integers. Adding the raw comparison results
        # only works while they are uint8: on PyTorch versions where comparisons
        # return bool, ``bool + bool`` stays bool and can never equal 2.
        endpoint_matches = (span_mask[:, :, :, 0] == 0).long() + (span_mask[:, :, :, 1] == 0).long()
        # A span is a gold mention if both endpoints match at least one gold span.
        span_mask, _ = (endpoint_matches == 2).long().max(-1)
        num_spans = span_mask.sum().item()
        span_mask = span_mask.float()
    else:
        span_mask = (spans[:, :, 0] >= 0).squeeze(-1).float()
        num_spans = spans.size(1)
    # Clamp negative (masked) indices to 0 so they can be used for indexing.
    # Shape: (batch_size, num_spans, 2)
    spans = F.relu(spans.float()).long()

    # Shape: (batch_size, document_length, encoding_dim)
    contextualized_embeddings = self._context_layer(text_embeddings, text_mask)
    # Shape: (batch_size, num_spans, 2 * encoding_dim + feature_size)
    endpoint_span_embeddings = self._endpoint_span_extractor(contextualized_embeddings, spans)
    # Shape: (batch_size, num_spans, embedding_size)
    attended_span_embeddings = self._attentive_span_extractor(text_embeddings, spans)

    # Shape: (batch_size, num_spans, embedding_size + 2 * encoding_dim + feature_size)
    span_embeddings = torch.cat([endpoint_span_embeddings, attended_span_embeddings], -1)

    # Prune based on mention scores.
    num_spans_to_keep = int(math.floor(self._spans_per_word * document_length))

    (top_span_embeddings, top_span_mask,
     top_span_indices, top_span_mention_scores) = self._mention_pruner(span_embeddings,
                                                                       span_mask,
                                                                       num_spans_to_keep)
    top_span_mask = top_span_mask.unsqueeze(-1)
    # Shape: (batch_size * num_spans_to_keep)
    flat_top_span_indices = util.flatten_and_batch_shift_indices(top_span_indices, num_spans)

    # Compute final predictions for which spans to consider as mentions.
    # Shape: (batch_size, num_spans_to_keep, 2)
    top_spans = util.batched_index_select(spans,
                                          top_span_indices,
                                          flat_top_span_indices)

    # Compute indices for antecedent spans to consider.
    max_antecedents = min(self._max_antecedents, num_spans_to_keep)

    # Shapes:
    # (num_spans_to_keep, max_antecedents),
    # (1, max_antecedents),
    # (1, num_spans_to_keep, max_antecedents)
    valid_antecedent_indices, valid_antecedent_offsets, valid_antecedent_log_mask = \
        self._generate_valid_antecedents(num_spans_to_keep, max_antecedents, util.get_device_of(text_mask))
    # Select tensors relating to the antecedent spans.
    # Shape: (batch_size, num_spans_to_keep, max_antecedents, embedding_size)
    candidate_antecedent_embeddings = util.flattened_index_select(top_span_embeddings,
                                                                  valid_antecedent_indices)

    # Shape: (batch_size, num_spans_to_keep, max_antecedents)
    candidate_antecedent_mention_scores = util.flattened_index_select(top_span_mention_scores,
                                                                      valid_antecedent_indices).squeeze(-1)
    # Compute antecedent scores.
    # Shape: (batch_size, num_spans_to_keep, max_antecedents, embedding_size)
    span_pair_embeddings = self._compute_span_pair_embeddings(top_span_embeddings,
                                                              candidate_antecedent_embeddings,
                                                              valid_antecedent_offsets)
    # Shape: (batch_size, num_spans_to_keep, 1 + max_antecedents)
    coreference_scores = self._compute_coreference_scores(span_pair_embeddings,
                                                          top_span_mention_scores,
                                                          candidate_antecedent_mention_scores,
                                                          valid_antecedent_log_mask)

    # Shape: (batch_size, num_spans_to_keep)
    _, predicted_antecedents = coreference_scores.max(2)
    # Column 0 of the scores is the extra "1 +" slot, so shift the argmax by one:
    # -1 then refers to that slot, and values >= 0 index the candidate antecedents.
    predicted_antecedents -= 1

    output_dict = {"top_spans": top_spans,
                   "antecedent_indices": valid_antecedent_indices,
                   "predicted_antecedents": predicted_antecedents}
    if span_labels is not None:
        # Find the gold labels for the spans which we kept.
        pruned_gold_labels = util.batched_index_select(span_labels.unsqueeze(-1),
                                                       top_span_indices,
                                                       flat_top_span_indices)

        antecedent_labels = util.flattened_index_select(pruned_gold_labels,
                                                        valid_antecedent_indices).squeeze(-1)
        antecedent_labels += valid_antecedent_log_mask.long()

        # Compute labels.
        # Shape: (batch_size, num_spans_to_keep, max_antecedents + 1)
        gold_antecedent_labels = self._compute_antecedent_gold_labels(pruned_gold_labels,
                                                                      antecedent_labels)
        coreference_log_probs = util.last_dim_log_softmax(coreference_scores, top_span_mask)
        correct_antecedent_log_probs = coreference_log_probs + gold_antecedent_labels.log()
        negative_marginal_log_likelihood = -util.logsumexp(correct_antecedent_log_probs).sum()

        self._mention_recall(top_spans, metadata)
        self._conll_coref_scores(top_spans, valid_antecedent_indices, predicted_antecedents, metadata)

        output_dict["loss"] = negative_marginal_log_likelihood

    if metadata is not None:
        output_dict["document"] = [x["original_text"] for x in metadata]
    return output_dict
def __init__(self,
             vocab: Vocabulary,
             params: Params,
             regularizer: RegularizerApplicator = None):
    """
    Build the base text field embedder and the four HMTL taggers
    (NER, EMD, relation extraction, coreference).

    ``params`` is consumed with ``pop``. It must contain ``text_field_embedder``
    plus one section per task (``ner``, ``emd``, ``relation``, ``coref``), each
    holding an ``encoder`` and a ``tagger`` sub-section.

    Parameters
    ----------
    vocab: the vocabulary fitted on the data.
    params: configuration parameters for the multi-task model.
    regularizer: optional regularizer, forwarded to the base ``Model`` and to
        the CRF and coreference taggers.
    """

    super(HMTL, self).__init__(vocab = vocab, regularizer = regularizer)

    # Base text Field Embedder
    text_field_embedder_params = params.pop("text_field_embedder")
    text_field_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                             params=text_field_embedder_params)
    self._text_field_embedder = text_field_embedder


    ############
    # NER Stuffs
    ############
    ner_params = params.pop("ner")

    # Encoder
    encoder_ner_params = ner_params.pop("encoder")
    encoder_ner = Seq2SeqEncoder.from_params(encoder_ner_params)
    self._encoder_ner = encoder_ner

    # Tagger NER - CRF Tagger
    tagger_ner_params = ner_params.pop("tagger")
    tagger_ner = CrfTagger(vocab = vocab,
                           text_field_embedder = self._text_field_embedder,
                           encoder = self._encoder_ner,
                           label_namespace = tagger_ner_params.pop("label_namespace", "labels"),
                           constraint_type = tagger_ner_params.pop("constraint_type", None),
                           dropout = tagger_ner_params.pop("dropout", None),
                           regularizer = regularizer)
    self._tagger_ner = tagger_ner


    ############
    # EMD Stuffs
    ############
    emd_params = params.pop("emd")

    # Encoder
    encoder_emd_params = emd_params.pop("encoder")
    encoder_emd = Seq2SeqEncoder.from_params(encoder_emd_params)
    self._encoder_emd = encoder_emd

    # The EMD layer reads the base embeddings through a shortcut connection
    # that also involves the NER encoder.
    shortcut_text_field_embedder = ShortcutConnectTextFieldEmbedder(base_text_field_embedder = self._text_field_embedder,
                                                                    previous_encoders = [self._encoder_ner])
    self._shortcut_text_field_embedder = shortcut_text_field_embedder


    # Tagger: EMD - CRF Tagger
    tagger_emd_params = emd_params.pop("tagger")
    tagger_emd = CrfTagger(vocab = vocab,
                           text_field_embedder = self._shortcut_text_field_embedder,
                           encoder = self._encoder_emd,
                           label_namespace = tagger_emd_params.pop("label_namespace", "labels"),
                           constraint_type = tagger_emd_params.pop("constraint_type", None),
                           # Read the dropout from the EMD section. The NER section was
                           # already consumed above, so popping it again always gave None.
                           dropout = tagger_emd_params.pop("dropout", None),
                           regularizer = regularizer)
    self._tagger_emd = tagger_emd


    ############################
    # Relation Extraction Stuffs
    ############################
    relation_params = params.pop("relation")

    # Encoder
    encoder_relation_params = relation_params.pop("encoder")
    encoder_relation = Seq2SeqEncoder.from_params(encoder_relation_params)
    self._encoder_relation = encoder_relation

    shortcut_text_field_embedder_relation = ShortcutConnectTextFieldEmbedder(base_text_field_embedder = self._text_field_embedder,
                                                                             previous_encoders = [self._encoder_ner, self._encoder_emd])
    self._shortcut_text_field_embedder_relation = shortcut_text_field_embedder_relation

    # Tagger: Relation
    tagger_relation_params = relation_params.pop("tagger")
    tagger_relation = RelationExtractor(vocab = vocab,
                                        text_field_embedder = self._shortcut_text_field_embedder_relation,
                                        context_layer = self._encoder_relation,
                                        d = tagger_relation_params.pop_int("d"),
                                        l = tagger_relation_params.pop_int("l"),
                                        n_classes = tagger_relation_params.pop("n_classes"),
                                        activation = tagger_relation_params.pop("activation"))
    self._tagger_relation = tagger_relation


    ##############
    # Coref Stuffs
    ##############
    coref_params = params.pop("coref")

    # Encoder
    encoder_coref_params = coref_params.pop("encoder")
    encoder_coref = Seq2SeqEncoder.from_params(encoder_coref_params)
    self._encoder_coref = encoder_coref

    shortcut_text_field_embedder_coref = ShortcutConnectTextFieldEmbedder(base_text_field_embedder = self._text_field_embedder,
                                                                          previous_encoders = [self._encoder_ner, self._encoder_emd])
    self._shortcut_text_field_embedder_coref = shortcut_text_field_embedder_coref

    # Tagger: Coreference
    tagger_coref_params = coref_params.pop("tagger")
    eval_on_gold_mentions = tagger_coref_params.pop_bool("eval_on_gold_mentions", False)
    init_params = tagger_coref_params.pop("initializer", None)
    initializer = (InitializerApplicator.from_params(init_params)
                   if init_params is not None
                   else InitializerApplicator())

    tagger_coref = CoreferenceCustom(vocab = vocab,
                                     text_field_embedder = self._shortcut_text_field_embedder_coref,
                                     context_layer = self._encoder_coref,
                                     mention_feedforward = FeedForward.from_params(tagger_coref_params.pop("mention_feedforward")),
                                     antecedent_feedforward = FeedForward.from_params(tagger_coref_params.pop("antecedent_feedforward")),
                                     feature_size = tagger_coref_params.pop_int("feature_size"),
                                     max_span_width = tagger_coref_params.pop_int("max_span_width"),
                                     spans_per_word = tagger_coref_params.pop_float("spans_per_word"),
                                     max_antecedents = tagger_coref_params.pop_int("max_antecedents"),
                                     lexical_dropout = tagger_coref_params.pop_float("lexical_dropout", 0.2),
                                     initializer = initializer,
                                     regularizer = regularizer,
                                     eval_on_gold_mentions = eval_on_gold_mentions)
    self._tagger_coref = tagger_coref
    if eval_on_gold_mentions:
        self._tagger_coref._eval_on_gold_mentions = True

    logger.info("Multi-Task Learning Model has been instantiated.")
@Model.register("coref_custom")
class LayerCoref(Model):
    """
    Single-task variant of the HMTL model holding only the coreference tagger (Lee et al).

    Parameters
    ----------
    vocab: ``allennlp.data.Vocabulary``, required.
        The vocabulary fitted on the data.
    params: ``allennlp.common.Params``, required
        Configuration parameters; the ``text_field_embedder`` and ``coref`` sections are consumed.
    regularizer: ``allennlp.nn.RegularizerApplicator``, optional (default = None)
        A regularizer to apply to the model's layers.
    """
    def __init__(self,
                 vocab: Vocabulary,
                 params: Params,
                 regularizer: RegularizerApplicator = None):

        super(LayerCoref, self).__init__(vocab = vocab, regularizer = regularizer)

        # Shared base embedder.
        self._text_field_embedder = BasicTextFieldEmbedder.from_params(vocab = vocab,
                                                                       params = params.pop("text_field_embedder"))

        # Coreference section: one encoder plus the tagger settings.
        coref_section = params.pop("coref")
        self._encoder_coref = Seq2SeqEncoder.from_params(coref_section.pop("encoder"))

        tagger_section = coref_section.pop("tagger")
        gold_mentions_at_eval = tagger_section.pop_bool("eval_on_gold_mentions", False)
        initializer_section = tagger_section.pop("initializer", None)
        if initializer_section is None:
            initializer = InitializerApplicator()
        else:
            initializer = InitializerApplicator.from_params(initializer_section)

        # Pop the remaining settings in the order the tagger's arguments are listed.
        mention_scorer = FeedForward.from_params(tagger_section.pop("mention_feedforward"))
        antecedent_scorer = FeedForward.from_params(tagger_section.pop("antecedent_feedforward"))
        feature_size = tagger_section.pop_int("feature_size")
        max_span_width = tagger_section.pop_int("max_span_width")
        spans_per_word = tagger_section.pop_float("spans_per_word")
        max_antecedents = tagger_section.pop_int("max_antecedents")
        lexical_dropout = tagger_section.pop_float("lexical_dropout", 0.2)

        self._tagger_coref = CoreferenceCustom(vocab = vocab,
                                               text_field_embedder = self._text_field_embedder,
                                               context_layer = self._encoder_coref,
                                               mention_feedforward = mention_scorer,
                                               antecedent_feedforward = antecedent_scorer,
                                               feature_size = feature_size,
                                               max_span_width = max_span_width,
                                               spans_per_word = spans_per_word,
                                               max_antecedents = max_antecedents,
                                               lexical_dropout = lexical_dropout,
                                               initializer = initializer,
                                               regularizer = regularizer,
                                               eval_on_gold_mentions = gold_mentions_at_eval)
        if gold_mentions_at_eval:
            self._tagger_coref._eval_on_gold_mentions = True

        logger.info("Multi-Task Learning Model has been instantiated.")

    @overrides
    def forward(self,
                tensor_batch,
                for_training: bool = False,
                task_name: str = "coref") -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Dispatch the batch to the tagger registered for ``task_name``.

        For coreference with gold-mention evaluation enabled, gold mentions are
        used only when ``for_training`` is false.
        """
        selected = getattr(self, "_tagger_%s" % task_name)

        if task_name == "coref" and selected._eval_on_gold_mentions:
            selected._use_gold_mentions = not for_training

        return selected.forward(**tensor_batch)

    @overrides
    def get_metrics(self,
                    task_name: str,
                    reset: bool = False,
                    full: bool = False) -> Dict[str, float]:
        """Return the metrics of the tagger for ``task_name``; ``full`` is only forwarded for coref."""
        selected = getattr(self, "_tagger_" + task_name)
        wants_full_coref = full and task_name == "coref"
        if not wants_full_coref:
            return selected.get_metrics(reset)
        return selected.get_metrics(reset = reset, full = full)

    @classmethod
    def from_params(cls,
                    vocab: Vocabulary,
                    params: Params,
                    regularizer: RegularizerApplicator) -> "LayerCoref":
        """Alternate constructor that forwards its three arguments unchanged."""
        model = cls(vocab = vocab,
                    params = params,
                    regularizer = regularizer)
        return model
def __init__(self,
             vocab: Vocabulary,
             params: Params,
             regularizer: RegularizerApplicator = None):
    """
    Build the base embedder, the EMD CRF tagger and the coreference tagger.

    ``params`` is consumed with ``pop``; it must contain ``text_field_embedder``,
    ``emd`` and ``coref`` sections, the last two each holding an ``encoder`` and
    a ``tagger`` sub-section.
    """

    super(LayerEmdCoref, self).__init__(vocab = vocab, regularizer = regularizer)

    # Shared base embedder.
    self._text_field_embedder = BasicTextFieldEmbedder.from_params(vocab = vocab,
                                                                   params = params.pop("text_field_embedder"))

    # ---- EMD: encoder + CRF tagger on top of the base embedder ----
    emd_section = params.pop("emd")
    self._encoder_emd = Seq2SeqEncoder.from_params(emd_section.pop("encoder"))

    emd_tagger_section = emd_section.pop("tagger")
    emd_label_namespace = emd_tagger_section.pop("label_namespace", "labels")
    emd_constraint_type = emd_tagger_section.pop("constraint_type", None)
    emd_dropout = emd_tagger_section.pop("dropout", None)
    self._tagger_emd = CrfTagger(vocab = vocab,
                                 text_field_embedder = self._text_field_embedder,
                                 encoder = self._encoder_emd,
                                 label_namespace = emd_label_namespace,
                                 constraint_type = emd_constraint_type,
                                 dropout = emd_dropout,
                                 regularizer = regularizer)

    # ---- Coref: encoder + shortcut embedder over the EMD encoder ----
    coref_section = params.pop("coref")
    self._encoder_coref = Seq2SeqEncoder.from_params(coref_section.pop("encoder"))

    self._shortcut_text_field_embedder_coref = ShortcutConnectTextFieldEmbedder(
        base_text_field_embedder = self._text_field_embedder,
        previous_encoders = [self._encoder_emd])

    coref_tagger_section = coref_section.pop("tagger")
    gold_mentions_at_eval = coref_tagger_section.pop_bool("eval_on_gold_mentions", False)
    initializer_section = coref_tagger_section.pop("initializer", None)
    if initializer_section is None:
        initializer = InitializerApplicator()
    else:
        initializer = InitializerApplicator.from_params(initializer_section)

    # Pop the remaining settings in the order the tagger's arguments are listed.
    mention_scorer = FeedForward.from_params(coref_tagger_section.pop("mention_feedforward"))
    antecedent_scorer = FeedForward.from_params(coref_tagger_section.pop("antecedent_feedforward"))
    feature_size = coref_tagger_section.pop_int("feature_size")
    max_span_width = coref_tagger_section.pop_int("max_span_width")
    spans_per_word = coref_tagger_section.pop_float("spans_per_word")
    max_antecedents = coref_tagger_section.pop_int("max_antecedents")
    lexical_dropout = coref_tagger_section.pop_float("lexical_dropout", 0.2)

    self._tagger_coref = CoreferenceCustom(vocab = vocab,
                                           text_field_embedder = self._shortcut_text_field_embedder_coref,
                                           context_layer = self._encoder_coref,
                                           mention_feedforward = mention_scorer,
                                           antecedent_feedforward = antecedent_scorer,
                                           feature_size = feature_size,
                                           max_span_width = max_span_width,
                                           spans_per_word = spans_per_word,
                                           max_antecedents = max_antecedents,
                                           lexical_dropout = lexical_dropout,
                                           initializer = initializer,
                                           regularizer = regularizer,
                                           eval_on_gold_mentions = gold_mentions_at_eval)
    if gold_mentions_at_eval:
        self._tagger_coref._eval_on_gold_mentions = True

    logger.info("Multi-Task Learning Model has been instantiated.")
@Model.register("emd_relation")
class LayerEmdRelation(Model):
    """
    A class that implements two tasks of the HMTL model: EMD (CRF Tagger) and Relation Extraction.

    Parameters
    ----------
    vocab: ``allennlp.data.Vocabulary``, required.
        The vocabulary fitted on the data.
    params: ``allennlp.common.Params``, required
        Configuration parameters for the multi-task model. The
        ``text_field_embedder``, ``emd`` and ``relation`` sections are consumed.
    regularizer: ``allennlp.nn.RegularizerApplicator``, optional (default = None)
        A regularizer to apply to the model's layers.
    """
    def __init__(self,
                 vocab: Vocabulary,
                 params: Params,
                 regularizer: RegularizerApplicator = None):

        super(LayerEmdRelation, self).__init__(vocab = vocab, regularizer = regularizer)


        # Base text Field Embedder
        text_field_embedder_params = params.pop("text_field_embedder")
        text_field_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                                 params=text_field_embedder_params)
        self._text_field_embedder = text_field_embedder


        ############
        # EMD Stuffs
        ############
        emd_params = params.pop("emd")

        # Encoder
        encoder_emd_params = emd_params.pop("encoder")
        encoder_emd = Seq2SeqEncoder.from_params(encoder_emd_params)
        self._encoder_emd = encoder_emd

        # Tagger EMD - CRF Tagger
        tagger_emd_params = emd_params.pop("tagger")
        tagger_emd = CrfTagger(vocab = vocab,
                               text_field_embedder = self._text_field_embedder,
                               encoder = self._encoder_emd,
                               label_namespace = tagger_emd_params.pop("label_namespace", "labels"),
                               constraint_type = tagger_emd_params.pop("constraint_type", None),
                               dropout = tagger_emd_params.pop("dropout", None),
                               regularizer = regularizer)
        self._tagger_emd = tagger_emd


        ############################
        # Relation Extraction Stuffs
        ############################
        relation_params = params.pop("relation")

        # Encoder
        encoder_relation_params = relation_params.pop("encoder")
        encoder_relation = Seq2SeqEncoder.from_params(encoder_relation_params)
        self._encoder_relation = encoder_relation

        # The relation layer reads the base embeddings through a shortcut
        # connection that also involves the EMD encoder.
        shortcut_text_field_embedder_relation = ShortcutConnectTextFieldEmbedder(base_text_field_embedder = self._text_field_embedder,
                                                                                 previous_encoders = [self._encoder_emd])
        self._shortcut_text_field_embedder_relation = shortcut_text_field_embedder_relation

        # Tagger: Relation
        tagger_relation_params = relation_params.pop("tagger")
        tagger_relation = RelationExtractor(vocab = vocab,
                                            text_field_embedder = self._shortcut_text_field_embedder_relation,
                                            context_layer = self._encoder_relation,
                                            d = tagger_relation_params.pop_int("d"),
                                            l = tagger_relation_params.pop_int("l"),
                                            n_classes = tagger_relation_params.pop("n_classes"),
                                            activation = tagger_relation_params.pop("activation"))
        self._tagger_relation = tagger_relation

        logger.info("Multi-Task Learning Model has been instantiated.")

    @overrides
    def forward(self,
                tensor_batch,
                for_training: bool = False,
                task_name: str = "emd") -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Run the tagger registered for ``task_name`` (``"emd"`` or ``"relation"``) on the batch.

        The default task is ``"emd"``: this model has no ``_tagger_ner``, so the
        previous default of ``"ner"`` raised ``AttributeError`` whenever the
        argument was omitted. ``for_training`` is accepted for interface parity
        with the other HMTL models and is not used here.
        """

        tagger = getattr(self, "_tagger_%s" % task_name)
        return tagger.forward(**tensor_batch)

    @overrides
    def get_metrics(self,
                    task_name: str,
                    reset: bool = False,
                    full: bool = False) -> Dict[str, float]:
        """
        Return the metrics of the tagger registered for ``task_name``.

        ``full`` is accepted for interface parity with the coreference models
        and is ignored.
        """

        task_tagger = getattr(self, "_tagger_" + task_name)
        return task_tagger.get_metrics(reset)

    @classmethod
    def from_params(cls,
                    vocab: Vocabulary,
                    params: Params,
                    regularizer: RegularizerApplicator) -> "LayerEmdRelation":
        """Alternate constructor that forwards its three arguments unchanged."""
        return cls(vocab = vocab,
                   params = params,
                   regularizer = regularizer)
@Model.register("ner")
class LayerNer(Model):
    """
    Single-task variant of the HMTL model holding only the first task: NER (CRF Tagger).

    Parameters
    ----------
    vocab: ``allennlp.data.Vocabulary``, required.
        The vocabulary fitted on the data.
    params: ``allennlp.common.Params``, required
        Configuration parameters; the ``text_field_embedder`` and ``ner`` sections are consumed.
    regularizer: ``allennlp.nn.RegularizerApplicator``, optional (default = None)
        A regularizer to apply to the model's layers.
    """
    def __init__(self,
                 vocab: Vocabulary,
                 params: Params,
                 regularizer: RegularizerApplicator = None):

        super(LayerNer, self).__init__(vocab = vocab, regularizer = regularizer)

        # Shared base embedder.
        self._text_field_embedder = BasicTextFieldEmbedder.from_params(vocab = vocab,
                                                                       params = params.pop("text_field_embedder"))

        # NER section: one encoder plus the CRF tagger settings.
        ner_section = params.pop("ner")
        self._encoder_ner = Seq2SeqEncoder.from_params(ner_section.pop("encoder"))

        tagger_section = ner_section.pop("tagger")
        label_namespace = tagger_section.pop("label_namespace", "labels")
        constraint_type = tagger_section.pop("constraint_type", None)
        dropout = tagger_section.pop("dropout", None)
        self._tagger_ner = CrfTagger(vocab = vocab,
                                     text_field_embedder = self._text_field_embedder,
                                     encoder = self._encoder_ner,
                                     label_namespace = label_namespace,
                                     constraint_type = constraint_type,
                                     dropout = dropout,
                                     regularizer = regularizer)

        logger.info("Multi-Task Learning Model has been instantiated.")

    @overrides
    def forward(self,
                tensor_batch,
                for_training: bool = False,
                task_name: str = "ner") -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """Unpack the batch into the tagger registered for ``task_name``; ``for_training`` is unused."""
        selected = getattr(self, "_tagger_%s" % task_name)
        outputs = selected.forward(**tensor_batch)
        return outputs

    @overrides
    def get_metrics(self,
                    task_name: str = "ner",
                    reset: bool = False,
                    full: bool = False) -> Dict[str, float]:
        """Return the metrics of the tagger registered for ``task_name``; ``full`` is ignored."""
        selected = getattr(self, "_tagger_" + task_name)
        metrics = selected.get_metrics(reset = reset)
        return metrics

    @classmethod
    def from_params(cls,
                    vocab: Vocabulary,
                    params: Params,
                    regularizer: RegularizerApplicator) -> "LayerNer":
        """Alternate constructor that forwards its three arguments unchanged."""
        model = cls(vocab = vocab,
                    params = params,
                    regularizer = regularizer)
        return model
def __init__(self,
             vocab: Vocabulary,
             params: Params,
             regularizer: RegularizerApplicator = None):
    """
    Build the base text field embedder and the NER and EMD CRF taggers.

    ``params`` is consumed with ``pop``. It must contain ``text_field_embedder``,
    ``ner`` and ``emd`` sections, the last two each holding an ``encoder`` and a
    ``tagger`` sub-section.

    Parameters
    ----------
    vocab: the vocabulary fitted on the data.
    params: configuration parameters for the multi-task model.
    regularizer: optional regularizer, forwarded to the base ``Model`` and to
        both CRF taggers.
    """

    super(LayerNerEmd, self).__init__(vocab = vocab, regularizer = regularizer)

    # Base text Field Embedder
    text_field_embedder_params = params.pop("text_field_embedder")
    text_field_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                             params=text_field_embedder_params)
    self._text_field_embedder = text_field_embedder

    ############
    # NER Stuffs
    ############
    ner_params = params.pop("ner")

    # Encoder
    encoder_ner_params = ner_params.pop("encoder")
    encoder_ner = Seq2SeqEncoder.from_params(encoder_ner_params)
    self._encoder_ner = encoder_ner

    # Tagger NER - CRF Tagger
    tagger_ner_params = ner_params.pop("tagger")
    tagger_ner = CrfTagger(vocab = vocab,
                           text_field_embedder = self._text_field_embedder,
                           encoder = self._encoder_ner,
                           label_namespace = tagger_ner_params.pop("label_namespace", "labels"),
                           constraint_type = tagger_ner_params.pop("constraint_type", None),
                           dropout = tagger_ner_params.pop("dropout", None),
                           regularizer = regularizer)
    self._tagger_ner = tagger_ner


    ############
    # EMD Stuffs
    ############
    emd_params = params.pop("emd")

    # Encoder
    encoder_emd_params = emd_params.pop("encoder")
    encoder_emd = Seq2SeqEncoder.from_params(encoder_emd_params)
    self._encoder_emd = encoder_emd

    # The EMD layer reads the base embeddings through a shortcut connection
    # that also involves the NER encoder.
    shortcut_text_field_embedder = ShortcutConnectTextFieldEmbedder(base_text_field_embedder = self._text_field_embedder,
                                                                    previous_encoders = [self._encoder_ner])
    self._shortcut_text_field_embedder = shortcut_text_field_embedder


    # Tagger: EMD - CRF Tagger
    tagger_emd_params = emd_params.pop("tagger")
    tagger_emd = CrfTagger(vocab = vocab,
                           text_field_embedder = self._shortcut_text_field_embedder,
                           encoder = self._encoder_emd,
                           label_namespace = tagger_emd_params.pop("label_namespace", "labels"),
                           constraint_type = tagger_emd_params.pop("constraint_type", None),
                           # Read the dropout from the EMD section. The NER section was
                           # already consumed above, so popping it again always gave None.
                           dropout = tagger_emd_params.pop("dropout", None),
                           regularizer = regularizer)
    self._tagger_emd = tagger_emd

    logger.info("Multi-Task Learning Model has been instantiated.")
def __init__(self,
             vocab: Vocabulary,
             params: Params,
             regularizer: RegularizerApplicator = None):
    """
    Build the NER tagger, the EMD tagger and the coreference model from ``params``.

    ``params`` must contain the keys ``text_field_embedder``, ``ner``, ``emd`` and ``coref``;
    each task section must contain ``encoder`` and ``tagger``.
    The EMD tagger is fed with the base embedder plus the NER encoder; the coreference
    model is fed with the base embedder plus the NER and EMD encoders.
    """
    super(LayerNerEmdCoref, self).__init__(vocab=vocab, regularizer=regularizer)

    # Base text field embedder, shared by every task.
    text_field_embedder_params = params.pop("text_field_embedder")
    self._text_field_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                                   params=text_field_embedder_params)

    ############
    # NER Stuffs
    ############
    ner_params = params.pop("ner")

    # Encoder
    self._encoder_ner = Seq2SeqEncoder.from_params(ner_params.pop("encoder"))

    # Tagger NER - CRF Tagger
    tagger_ner_params = ner_params.pop("tagger")
    self._tagger_ner = CrfTagger(vocab=vocab,
                                 text_field_embedder=self._text_field_embedder,
                                 encoder=self._encoder_ner,
                                 label_namespace=tagger_ner_params.pop("label_namespace", "labels"),
                                 constraint_type=tagger_ner_params.pop("constraint_type", None),
                                 dropout=tagger_ner_params.pop("dropout", None),
                                 regularizer=regularizer)

    ############
    # EMD Stuffs
    ############
    emd_params = params.pop("emd")

    # Encoder
    self._encoder_emd = Seq2SeqEncoder.from_params(emd_params.pop("encoder"))

    self._shortcut_text_field_embedder = ShortcutConnectTextFieldEmbedder(
        base_text_field_embedder=self._text_field_embedder,
        previous_encoders=[self._encoder_ner])

    # Tagger: EMD - CRF Tagger
    tagger_emd_params = emd_params.pop("tagger")
    self._tagger_emd = CrfTagger(vocab=vocab,
                                 text_field_embedder=self._shortcut_text_field_embedder,
                                 encoder=self._encoder_emd,
                                 label_namespace=tagger_emd_params.pop("label_namespace", "labels"),
                                 constraint_type=tagger_emd_params.pop("constraint_type", None),
                                 # Read the dropout from the EMD section. It used to be popped
                                 # from the (already consumed) NER section, so it was always None.
                                 dropout=tagger_emd_params.pop("dropout", None),
                                 regularizer=regularizer)

    ##############
    # Coref Stuffs
    ##############
    coref_params = params.pop("coref")

    # Encoder
    self._encoder_coref = Seq2SeqEncoder.from_params(coref_params.pop("encoder"))

    self._shortcut_text_field_embedder_coref = ShortcutConnectTextFieldEmbedder(
        base_text_field_embedder=self._text_field_embedder,
        previous_encoders=[self._encoder_ner, self._encoder_emd])

    # Tagger: Coreference
    tagger_coref_params = coref_params.pop("tagger")
    eval_on_gold_mentions = tagger_coref_params.pop_bool("eval_on_gold_mentions", False)
    init_params = tagger_coref_params.pop("initializer", None)
    if init_params is not None:
        initializer = InitializerApplicator.from_params(init_params)
    else:
        initializer = InitializerApplicator()

    self._tagger_coref = CoreferenceCustom(
        vocab=vocab,
        text_field_embedder=self._shortcut_text_field_embedder_coref,
        context_layer=self._encoder_coref,
        mention_feedforward=FeedForward.from_params(tagger_coref_params.pop("mention_feedforward")),
        antecedent_feedforward=FeedForward.from_params(tagger_coref_params.pop("antecedent_feedforward")),
        feature_size=tagger_coref_params.pop_int("feature_size"),
        max_span_width=tagger_coref_params.pop_int("max_span_width"),
        spans_per_word=tagger_coref_params.pop_float("spans_per_word"),
        max_antecedents=tagger_coref_params.pop_int("max_antecedents"),
        lexical_dropout=tagger_coref_params.pop_float("lexical_dropout", 0.2),
        initializer=initializer,
        regularizer=regularizer,
        eval_on_gold_mentions=eval_on_gold_mentions)
    # NOTE(review): the flag is also passed to the constructor above; this explicit
    # assignment is kept from the original code -- confirm whether it is still needed.
    if eval_on_gold_mentions:
        self._tagger_coref._eval_on_gold_mentions = True

    logger.info("Multi-Task Learning Model has been instantiated.")
@Model.register("ner_emd_relation")
class LayerNerEmdRelation(Model):
    """
    A class that implements three tasks of the HMTL model: NER (CRF Tagger), EMD (CRF Tagger)
    and Relation Extraction.

    The NER tagger reads the base word embeddings. The EMD tagger reads a shortcut embedder
    built from the base embedder and the NER encoder. The relation extractor reads a shortcut
    embedder built from the base embedder and the NER and EMD encoders.

    Parameters
    ----------
    vocab: ``allennlp.data.Vocabulary``, required.
        The vocabulary fitted on the data.
    params: ``allennlp.common.Params``, required
        Configuration parameters for the multi-task model.
        Must contain the keys ``text_field_embedder``, ``ner``, ``emd`` and ``relation``;
        each task section must contain ``encoder`` and ``tagger``.
    regularizer: ``allennlp.nn.RegularizerApplicator``, optional (default = None)
        A regularizer to apply to the model's layers.
    """
    def __init__(self,
                 vocab: Vocabulary,
                 params: Params,
                 regularizer: RegularizerApplicator = None):

        super(LayerNerEmdRelation, self).__init__(vocab=vocab, regularizer=regularizer)

        # Base text field embedder, shared by every task.
        text_field_embedder_params = params.pop("text_field_embedder")
        self._text_field_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                                       params=text_field_embedder_params)

        ############
        # NER Stuffs
        ############
        ner_params = params.pop("ner")

        # Encoder
        self._encoder_ner = Seq2SeqEncoder.from_params(ner_params.pop("encoder"))

        # Tagger NER - CRF Tagger
        tagger_ner_params = ner_params.pop("tagger")
        self._tagger_ner = CrfTagger(vocab=vocab,
                                     text_field_embedder=self._text_field_embedder,
                                     encoder=self._encoder_ner,
                                     label_namespace=tagger_ner_params.pop("label_namespace", "labels"),
                                     constraint_type=tagger_ner_params.pop("constraint_type", None),
                                     dropout=tagger_ner_params.pop("dropout", None),
                                     regularizer=regularizer)

        ############
        # EMD Stuffs
        ############
        emd_params = params.pop("emd")

        # Encoder
        self._encoder_emd = Seq2SeqEncoder.from_params(emd_params.pop("encoder"))

        self._shortcut_text_field_embedder = ShortcutConnectTextFieldEmbedder(
            base_text_field_embedder=self._text_field_embedder,
            previous_encoders=[self._encoder_ner])

        # Tagger: EMD - CRF Tagger
        tagger_emd_params = emd_params.pop("tagger")
        self._tagger_emd = CrfTagger(vocab=vocab,
                                     text_field_embedder=self._shortcut_text_field_embedder,
                                     encoder=self._encoder_emd,
                                     label_namespace=tagger_emd_params.pop("label_namespace", "labels"),
                                     constraint_type=tagger_emd_params.pop("constraint_type", None),
                                     # Read the dropout from the EMD section. It used to be popped
                                     # from the (already consumed) NER section, so it was always None.
                                     dropout=tagger_emd_params.pop("dropout", None),
                                     regularizer=regularizer)

        ############################
        # Relation Extraction Stuffs
        ############################
        relation_params = params.pop("relation")

        # Encoder
        self._encoder_relation = Seq2SeqEncoder.from_params(relation_params.pop("encoder"))

        self._shortcut_text_field_embedder_relation = ShortcutConnectTextFieldEmbedder(
            base_text_field_embedder=self._text_field_embedder,
            previous_encoders=[self._encoder_ner, self._encoder_emd])

        # Tagger: Relation
        tagger_relation_params = relation_params.pop("tagger")
        self._tagger_relation = RelationExtractor(
            vocab=vocab,
            text_field_embedder=self._shortcut_text_field_embedder_relation,
            context_layer=self._encoder_relation,
            d=tagger_relation_params.pop_int("d"),
            l=tagger_relation_params.pop_int("l"),
            # n_classes sizes a weight matrix, so it is coerced to int like d and l.
            n_classes=tagger_relation_params.pop_int("n_classes"),
            # "relu" is the default documented by RelationExtractor.
            activation=tagger_relation_params.pop("activation", "relu"))

        logger.info("Multi-Task Learning Model has been instantiated.")

    @overrides
    def forward(self,
                tensor_batch,
                for_training: bool = False,
                task_name: str = "ner") -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Run the tagger of the requested task on a batch.

        ``tensor_batch`` is unpacked as keyword arguments of the tagger's ``forward``.
        ``for_training`` is accepted for interface compatibility and is not used here.
        ``task_name`` selects the attribute ``_tagger_<task_name>``.
        """
        tagger = getattr(self, "_tagger_%s" % task_name)
        return tagger.forward(**tensor_batch)

    @overrides
    def get_metrics(self,
                    task_name: str,
                    reset: bool = False,
                    full: bool = False) -> Dict[str, float]:
        """
        Return the metrics of the tagger stored in ``_tagger_<task_name>``.

        ``full`` is accepted for interface compatibility and is not used here.
        """
        task_tagger = getattr(self, "_tagger_" + task_name)
        return task_tagger.get_metrics(reset)

    @classmethod
    def from_params(cls,
                    vocab: Vocabulary,
                    params: Params,
                    regularizer: RegularizerApplicator) -> "LayerNerEmdRelation":
        """Build the model from a vocabulary, a configuration and a regularizer."""
        return cls(vocab=vocab,
                   params=params,
                   regularizer=regularizer)
def __init__(self,
             vocab: Vocabulary,
             params: Params,
             regularizer: RegularizerApplicator = None):
    """
    Build the base text field embedder, the relation encoder and the relation extractor.

    ``params`` must contain the keys ``text_field_embedder`` and ``relation``;
    ``relation`` must contain ``encoder`` and ``tagger``.
    The relation extractor reads the base embedder directly (no shortcut connection).
    """
    super(LayerRelation, self).__init__(vocab=vocab, regularizer=regularizer)

    # Base text field embedder
    text_field_embedder_params = params.pop("text_field_embedder")
    self._text_field_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                                   params=text_field_embedder_params)

    ############################
    # Relation Extraction Stuffs
    ############################
    relation_params = params.pop("relation")

    # Encoder
    self._encoder_relation = Seq2SeqEncoder.from_params(relation_params.pop("encoder"))

    # Tagger: Relation
    tagger_relation_params = relation_params.pop("tagger")
    self._tagger_relation = RelationExtractor(
        vocab=vocab,
        text_field_embedder=self._text_field_embedder,
        context_layer=self._encoder_relation,
        d=tagger_relation_params.pop_int("d"),
        l=tagger_relation_params.pop_int("l"),
        # n_classes sizes a weight matrix, so it is coerced to int like d and l.
        n_classes=tagger_relation_params.pop_int("n_classes"),
        # "relu" is the default documented by RelationExtractor.
        activation=tagger_relation_params.pop("activation", "relu"))

    logger.info("Multi-Task Learning Model has been instantiated.")
@Model.register("relation_extractor")
class RelationExtractor(Model):
    """
    A class containing the scoring model for relation extraction.
    It is derived from the model proposed by Bekoulis G. in
    "Joint entity recognition and relation extraction as a multi-head selection problem"
    https://bekou.github.io/papers/eswa2018b/bekoulis_eswa_2018b.pdf

    Parameters
    ----------
    vocab: ``allennlp.data.Vocabulary``, required.
        The vocabulary fitted on the data.
    text_field_embedder : ``TextFieldEmbedder``, required
        Used to embed the ``text`` ``TextField`` we get as input to the model.
    context_layer : ``Seq2SeqEncoder``, required
        This layer incorporates contextual information for each word in the document.
    d: ``int``, required
        The (half) dimension of embedding given by the encoder context_layer.
    l: ``int``, required
        The dimension of the relation extractor scorer embedding.
    n_classes: ``int``, required
        The number of different possible relation classes.
    activation: ``str``, optional (default = "relu")
        Non-linear activation function for the scorer. Can be either "relu" or "tanh".
    label_namespace: ``str``, optional (default = "relation_ace_labels")
        The namespace for the labels of the task of relation extraction.

    Raises
    ------
    ValueError
        If ``activation`` is neither "relu" nor "tanh".
    """
    _SUPPORTED_ACTIVATIONS = ("relu", "tanh")

    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 context_layer: Seq2SeqEncoder,
                 d: int,
                 l: int,
                 n_classes: int,
                 activation: str = "relu",
                 label_namespace: str = "relation_ace_labels") -> None:
        super(RelationExtractor, self).__init__(vocab)

        # Fail at construction time: an unknown activation used to surface only in
        # forward(), as an UnboundLocalError.
        if activation not in self._SUPPORTED_ACTIVATIONS:
            raise ValueError("Unsupported activation %r: expected one of %s"
                             % (activation, ", ".join(self._SUPPORTED_ACTIVATIONS)))

        # Scorer parameters; the encoder output is expected to be of size 2*d.
        self._U = nn.Parameter(torch.Tensor(2 * d, l))
        self._W = nn.Parameter(torch.Tensor(2 * d, l))
        self._V = nn.Parameter(torch.Tensor(l, n_classes))
        self._b = nn.Parameter(torch.Tensor(l))

        self.init_weights()

        self._n_classes = n_classes
        self._activation = activation

        self._text_field_embedder = text_field_embedder
        self._context_layer = context_layer

        self._label_namespace = label_namespace

        self._relation_metric = RelationF1Measure()

        self._loss_fn = nn.BCEWithLogitsLoss()

    def init_weights(self) -> None:
        """
        Initialization for the weights of the model:
        Kaiming-normal for the three matrices, standard normal for the bias.
        """
        nn.init.kaiming_normal_(self._U)
        nn.init.kaiming_normal_(self._W)
        nn.init.kaiming_normal_(self._V)

        nn.init.normal_(self._b)

    def multi_class_cross_entropy_loss(self,
                                       scores,
                                       labels,
                                       mask):
        """
        Compute the masked binary cross-entropy loss between the relation scores (logits)
        and the gold relation labels.

        Parameters
        ----------
        scores:
            Relation logits. Size: n_batches x padded_document_length x padded_document_length x n_classes
        labels:
            Gold relations, same size as ``scores``.
        mask:
            Sentence-level mask. Size: n_batches x padded_document_length

        Returns
        -------
        The mean binary cross-entropy with logits over all elements, multiplied by 100.
        """
        # Turn the sentence-level mask into a pairwise mask:
        # pair_mask[b, i, j] = mask[b, i] * mask[b, j].
        mask = mask.float()  # Size: n_batches x padded_document_length
        pair_mask = mask.unsqueeze(2) * mask.unsqueeze(1)
        # The trailing singleton dimension broadcasts over the n_classes axis.
        pair_mask = pair_mask.unsqueeze(-1)

        # NOTE(review): masked positions are zeroed, not removed: a zero logit with a zero
        # label still adds log(2) to the mean. Kept as in the original to preserve the
        # loss scale -- confirm whether padding should be excluded instead.
        masked_scores = scores * pair_mask
        masked_labels = labels * pair_mask

        loss = self._loss_fn(masked_scores, masked_labels)

        # Amplify the loss to actually see something...
        return 100 * loss

    @overrides
    def forward(self,
                text: Dict[str, torch.LongTensor],
                relations: torch.IntTensor = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Forward pass of the model.
        Compute the predictions and the loss (if labels are available).

        Parameters
        ----------
        text: Dict[str, torch.LongTensor]
            The input sentences which have been transformed into indexes (integers)
            according to a mapping token:str -> token:int
        relations: torch.IntTensor
            The gold relations to predict.

        Returns
        -------
        A dictionary with the keys ``relation_sigmoid_scores``, ``predicted_relations``,
        ``mask`` and, when ``relations`` is given, ``loss``.
        """
        # The text field embedder maps the token:int to their embedding representation.
        text_embeddings = self._text_field_embedder(text)
        # 1 if there is actually a word in the corresponding sentence, 0 if it has been padded.
        mask = util.get_text_field_mask(text)  # Size: batch_size x padded_document_length

        # Contextualized representation computed from the word embeddings.
        encoded_text = self._context_layer(text_embeddings, mask)  # Size: batch_size x padded_document_length x lstm_output_size

        ###### Relation Scorer ##############
        left = torch.matmul(encoded_text, self._U)   # Size: batch_size x padded_document_length x l
        right = torch.matmul(encoded_text, self._W)  # Size: batch_size x padded_document_length x l

        # Outer sum: pairwise[b, i, j, :] = left[b, i, :] + right[b, j, :]
        pairwise = left.unsqueeze(2) + right.unsqueeze(1)  # Size: batch_size x padded_document_length x padded_document_length x l

        outer_sum_bias = pairwise + self._b
        if self._activation == "relu":
            activated_outer_sum_bias = F.relu(outer_sum_bias)
        elif self._activation == "tanh":
            # torch.tanh replaces the deprecated F.tanh.
            activated_outer_sum_bias = torch.tanh(outer_sum_bias)
        else:
            raise ValueError("Unsupported activation %r" % (self._activation,))

        relation_scores = torch.matmul(activated_outer_sum_bias, self._V)  # Size: batch_size x padded_document_length x padded_document_length x n_classes
        #################################################################

        batch_size, padded_document_length = mask.size()

        relation_sigmoid_scores = torch.sigmoid(relation_scores)

        # predicted_relations[l, i, j, k] == 1 iff we predict a relation k with
        # ARG1==i, ARG2==j in the l-th sentence of the batch.
        predicted_relations = torch.round(relation_sigmoid_scores)

        output_dict = {
            "relation_sigmoid_scores": relation_sigmoid_scores,
            "predicted_relations": predicted_relations,
            "mask": mask
        }

        if relations is not None:
            # gold_relations[l, i, j, k] == 1 iff there is a gold relation k with
            # ARG1==i, ARG2==j in the l-th sentence of the batch.
            # Allocated with the dtype and device of the scores, so no explicit
            # .cuda() transfer to the default GPU is needed.
            gold_relations = relation_scores.new_zeros(
                (batch_size, padded_document_length, padded_document_length, self._n_classes))

            for example_idx, example_relations in enumerate(relations):  # going through the batch
                # Each rel is of size padded_document_length; its non-zero elements mark the
                # two argument words AND the relation type between these two words.
                for rel in example_relations:
                    # Relations have been padded: padded relations are filled with 0.
                    if rel.sum().item() == 0:
                        continue

                    # Reset for every relation so that nothing leaks from the previous one.
                    arg1_position = None
                    arg2_position = None
                    rel_type = None
                    for position in rel.nonzero().reshape(-1).tolist():
                        label = self.vocab.get_token_from_index(rel[position].item(), self._label_namespace)
                        arg, rel_type = label.split("_", 1)
                        if arg == "ARG1":
                            arg1_position = position
                        else:
                            arg2_position = position

                    # A relation missing one of its two arguments cannot be placed in the
                    # label tensor; it used to reuse stale indexes or raise a NameError.
                    if arg1_position is None or arg2_position is None:
                        continue

                    gold_relations[example_idx, arg1_position, arg2_position, rel_type_2_idx[rel_type]] = 1

            # Compute the loss
            output_dict["loss"] = self.multi_class_cross_entropy_loss(scores=relation_scores,
                                                                      labels=gold_relations,
                                                                      mask=mask)

            # Compute the metrics with the predictions.
            self._relation_metric(predictions=predicted_relations, gold_labels=gold_relations, mask=mask)

        return output_dict

    @overrides
    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, Any]:
        """
        Decode the predictions.

        Adds the key ``decoded_predictions`` to ``output_dict``: for each instance, a list of
        relations, each being a list of ``sentence_length`` strings filled with "*" except
        at the two argument positions ("ARG1_<type>" and "ARG2_<type>").
        """
        decoded_predictions = []

        for instance_tags in output_dict["predicted_relations"]:
            sentence_length = instance_tags.size(0)
            decoded_relations = []

            # tolist() yields plain Python ints: 0-dim tensors hash by identity and
            # therefore can never match the integer keys of idx_2_rel_type.
            for arg1, arg2, rel_type_idx in instance_tags.nonzero().tolist():
                relation = ["*"] * sentence_length
                rel_type = idx_2_rel_type[rel_type_idx]
                relation[arg1] = "ARG1_" + rel_type
                relation[arg2] = "ARG2_" + rel_type
                decoded_relations.append(relation)

            decoded_predictions.append(decoded_relations)

        output_dict["decoded_predictions"] = decoded_predictions

        return output_dict

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """
        Compute the metrics for relation: precision, recall and f1.
        A relation is considered correct if we can correctly predict the last word of ARG1,
        the last word of ARG2 and the relation type.

        Only the entries of the metric whose name contains "overall" are returned.
        """
        metric_dict = self._relation_metric.get_metric(reset=reset)
        return {x: y for x, y in metric_dict.items() if "overall" in x}
def __init__(self,
             input_dim: int,
             hidden_sizes: List[int],
             num_layers: int,
             bidirectional: bool,
             dropouts: List[float] = None) -> None:
    """
    Build ``num_layers`` single-layer GRUs, the k-th one with hidden size ``hidden_sizes[k]``.

    Raises
    ------
    ValueError
        If ``hidden_sizes`` or ``dropouts`` does not have exactly ``num_layers`` elements.
    """
    super(StackedGRU, self).__init__()

    self._input_dim = input_dim
    self._hidden_sizes = hidden_sizes
    self._num_layers = num_layers
    self._bidirectional = bidirectional
    self._dropouts = [0.]*num_layers if dropouts is None else dropouts

    if len(self._hidden_sizes) != self._num_layers:
        raise ValueError(f"Number of layers ({self._num_layers}) must be equal to the length of hidden state size list ({len(self._hidden_sizes)})")
    if len(self._dropouts) != self._num_layers:
        raise ValueError(f"Number of layers ({self._num_layers}) must be equal to the legnth of drouput rates list ({len(self._dropouts)})")

    # A bidirectional layer concatenates both directions, hence twice the hidden size.
    self._output_dim = hidden_sizes[-1]
    if self._bidirectional:
        self._output_dim *= 2

    self._gru_layers: List[GRU] = []
    for k in range(self._num_layers):
        # The first layer reads the inputs; the next ones read the previous layer's output.
        input_size = self._input_dim if k == 0 else self._hidden_sizes[k-1]
        if self._bidirectional and (k != 0):
            input_size *= 2

        # batch_first=True: forward() receives (batch_size, timesteps, input_dim) tensors.
        # With the PyTorch default (batch_first=False) the recurrence ran over the batch axis.
        # NOTE(review): per the PyTorch docs, `dropout` only acts between the layers of one
        # GRU module; with num_layers=1 it is presumably a no-op -- confirm.
        gru_layer = GRU(input_size=input_size,
                        hidden_size=self._hidden_sizes[k],
                        dropout=self._dropouts[k],
                        num_layers=1,
                        batch_first=True,
                        bidirectional=self._bidirectional)
        # Registered by name so that the parameters keep the "gru_<k>" prefix.
        self.add_module(f"gru_{k}", gru_layer)
        self._gru_layers.append(gru_layer)
@classmethod
def from_params(cls, params: Params) -> 'StackedGRU':
    """
    Alternate constructor: read the encoder configuration from ``params``.

    Reads ``input_dim``, ``hidden_sizes``, ``dropouts`` (optional, default None),
    ``num_layers`` and ``bidirectional``, then checks that no key is left in ``params``.
    """
    # Keys are consumed in the same order as before; assert_empty runs once all are popped.
    kwargs = {}
    kwargs["input_dim"] = params.pop_int('input_dim')
    kwargs["hidden_sizes"] = params.pop('hidden_sizes')
    kwargs["dropouts"] = params.pop('dropouts', None)
    kwargs["num_layers"] = params.pop_int('num_layers')
    kwargs["bidirectional"] = params.pop_bool('bidirectional')
    params.assert_empty(cls.__name__)

    return cls(**kwargs)
@TextFieldEmbedder.register("shortcut_connect_text_field_embedder")
class ShortcutConnectTextFieldEmbedder(TextFieldEmbedder):
    """
    This class implements a specific text field embedder that benefits from the output of
    one or several ``allennlp.modules.seq2seq_encoders.seq2seq_encoder.Seq2SeqEncoder``.
    It concatenates two embedding vectors: the one produced by the chain of ``previous_encoders``
    and the one from the ``base_text_field_embedder``
    (an ``allennlp.modules.text_field_embedders.text_field_embedder.TextFieldEmbedder``).
    The latter computes the word representation and explains the name of this class,
    "ShortcutConnectTextFieldEmbedder": the next ``Seq2SeqEncoder`` is fed with the output
    of the previous encoders and the output of the base text field embedder, the
    connection with the base text field embedder circumventing the previous encoders.

    Parameters
    ----------
    base_text_field_embedder : ``TextFieldEmbedder``, required
        The text field embedder that computes the word representation at the base of the model.
    previous_encoders : ``List[Seq2SeqEncoder]``, required
        The encoders applied one after the other on top of the base representation.
        The base representation is concatenated again after each of them.
    """
    def __init__(self,
                 base_text_field_embedder: TextFieldEmbedder,
                 previous_encoders: List[Seq2SeqEncoder]) -> None:
        super(ShortcutConnectTextFieldEmbedder, self).__init__()
        self._base_text_field_embedder = base_text_field_embedder
        # NOTE(review): the encoders are kept in a plain Python list; presumably they
        # are registered as submodules by the component that owns them - confirm.
        self._previous_encoders = previous_encoders

    @overrides
    def get_output_dim(self) -> int:
        """
        Return the size of the last dimension produced by ``forward``: the base
        embedding size, plus the output size of the last previous encoder if any.
        """
        output_dim = self._base_text_field_embedder.get_output_dim()
        # With no previous encoder ``forward`` returns the base representation alone,
        # so there is nothing to add (and no last element to index).
        if self._previous_encoders:
            output_dim += self._previous_encoders[-1].get_output_dim()

        return output_dim

    @overrides
    def forward(self,
                text_field_input: Dict[str, torch.Tensor],
                num_wrapping_dims: int = 0) -> torch.Tensor:
        """
        Embed ``text_field_input`` with the base embedder, then run every previous
        encoder in order, concatenating the base representation to the output of
        each encoder along the last dimension.
        """
        base_representation = self._base_text_field_embedder.forward(text_field_input, num_wrapping_dims)
        # NOTE(review): the mask is computed without ``num_wrapping_dims``; confirm
        # that callers only ever use ``num_wrapping_dims == 0``.
        mask = util.get_text_field_mask(text_field_input)

        text_field_embeddings = base_representation
        for encoder in self._previous_encoders:
            text_field_embeddings = encoder(text_field_embeddings, mask)
            # Shortcut connection: the word representation bypasses the encoder.
            text_field_embeddings = torch.cat([base_representation, text_field_embeddings], dim = -1)

        return text_field_embeddings
+ validation_metric_name : ``str``, required + The name of the validation metric to use to monitor training + and select the best epoch. + validation_metric_decreases : ``bool``, required + Whether or not the validation metric should decrease for improvement. + evaluate_on_test : ``bool`, optional (default = False) + Whether or not the task should be evaluated on the test set at the end of the training. + """ + def __init__(self, + name: str, + validation_metric_name: str, + validation_metric_decreases: bool, + evaluate_on_test: bool = False) -> None: + self._name = name + + self._train_data = None + self._validation_data = None + self._test_data = None + self._evaluate_on_test = evaluate_on_test + + self._val_metric = validation_metric_name + self._val_metric_decreases = validation_metric_decreases + + self._data_iterator = None + + + def set_data_iterator(self, + data_iterator: DataIterator): + if data_iterator is not None: + self._data_iterator = data_iterator + else: + ConfigurationError(f"data_iterator cannot be None in set_iterator - Task name: {self._name}") + + + def load_data_from_params(self, + params: Params): + all_datasets = datasets_from_params(params) + datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) + + for dataset in datasets_for_vocab_creation: + if dataset not in all_datasets: + raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") + + instances_for_vocab_creation = (instance for key, dataset in all_datasets.items() + for instance in dataset + if key in datasets_for_vocab_creation) + + self._instances_for_vocab_creation = instances_for_vocab_creation + self._datasets_for_vocab_creation = datasets_for_vocab_creation + + if 'train' in all_datasets.keys(): + self._train_data = all_datasets["train"] + self._tr_instances = sum(1 for e in self._train_data) # This is horrible if lazy iterator (Iterable) + if 'validation' in all_datasets.keys(): + self._validation_data = 
all_datasets["validation"] + self._val_instances = sum(1 for e in self._validation_data) # This is horrible if lazy iterator (Iterable) + if 'test' in all_datasets.keys(): + self._test_data = all_datasets["test"] + self._test_instances = sum(1 for e in self._test_data) # This is horrible if lazy iterator (Iterable) + + # If trying to evaluate on test set, make sure the dataset is loaded + if self._evaluate_on_test: + assert self._test_data is not None + + #return instances_for_vocab_creation, datasets_for_vocab_creation, all_datasets + return instances_for_vocab_creation, datasets_for_vocab_creation + + @classmethod + def from_params(cls, params: Params) -> "Task": + task_name = params.pop("task_name", "ner") + validation_metric_name = params.pop("validation_metric_name", "f1-measure-overall") + validation_metric_decreases = params.pop_bool("validation_metric_decreases", False) + evaluate_on_test = params.pop_bool("evaluate_on_test", False) + + params.assert_empty(cls.__name__) + return cls(name = task_name, + validation_metric_name = validation_metric_name, + validation_metric_decreases = validation_metric_decreases, + evaluate_on_test = evaluate_on_test) \ No newline at end of file diff --git a/hmtl/training/__init__.py b/hmtl/training/__init__.py new file mode 100644 index 0000000..f7eecc3 --- /dev/null +++ b/hmtl/training/__init__.py @@ -0,0 +1,3 @@ +# coding: utf-8 + +from hmtl.training.sampler_multi_task_trainer import SamplerMultiTaskTrainer \ No newline at end of file diff --git a/hmtl/training/metrics/__init__.py b/hmtl/training/metrics/__init__.py new file mode 100644 index 0000000..d0094ad --- /dev/null +++ b/hmtl/training/metrics/__init__.py @@ -0,0 +1,4 @@ +# coding: utf-8 + +from hmtl.training.metrics.relation_f1_measure import RelationF1Measure +from hmtl.training.metrics.conll_coref_full_scores import ConllCorefFullScores \ No newline at end of file diff --git a/hmtl/training/metrics/conll_coref_full_scores.py 
class ConllCorefFullScores(ConllCorefScores):
    """
    A marginal modification of ``ConllCorefScores`` imported from ``allennlp.training.metrics``.
    Besides the scores averaged over ``self.scorers``, it can also report the
    precision, recall and f1 of every individual scorer.
    """
    def __init__(self) -> None:
        super(ConllCorefFullScores, self).__init__()

    @overrides
    def get_metric(self, reset: bool = False, full: bool = False):
        """
        Return a dictionary with the keys "coref_precision", "coref_recall" and
        "coref_f1", each averaged over ``self.scorers``.

        Parameters
        ----------
        reset : ``bool``, optional (default = False)
            Whether ``self.reset()`` should be called once the metrics are computed.
        full : ``bool``, optional (default = False)
            Whether to also add, for every scorer, an entry keyed by
            ``scorer.metric.__name__`` holding its "precision", "recall" and "f1_score".
        """
        report = {}
        if full:
            # One detailed entry per scorer, named after the scorer's metric function.
            for scorer in self.scorers:
                report[scorer.metric.__name__] = {"precision": scorer.get_precision(),
                                                  "recall": scorer.get_recall(),
                                                  "f1_score": scorer.get_f1()}

        # Averages over all the scorers.
        mean_precision = sum(scorer.get_precision() for scorer in self.scorers) / len(self.scorers)
        mean_recall = sum(scorer.get_recall() for scorer in self.scorers) / len(self.scorers)
        mean_f1 = sum(scorer.get_f1() for scorer in self.scorers) / len(self.scorers)

        report["coref_precision"] = mean_precision
        report["coref_recall"] = mean_recall
        report["coref_f1"] = mean_f1

        if reset:
            self.reset()

        return report
def __call__(self,
             predictions: torch.Tensor,
             gold_labels: torch.Tensor,
             mask: Optional[torch.Tensor] = None):
    """
    Update the TP, FP and FN counters.

    Every non-zero entry of ``predictions`` (resp. ``gold_labels``) that survives the
    masking counts as one predicted (resp. gold) relation. An entry that is non-zero
    in both tensors is a true positive.

    Parameters
    ----------
    predictions : ``torch.Tensor``, required.
        A tensor of shape
        (batch_size, padded_document_length, padded_document_length, n_classes).
    gold_labels : ``torch.Tensor``, required.
        A tensor with exactly the same shape as ``predictions``.
    mask: ``torch.Tensor``, optional (default = None).
        A token-level mask of shape (batch_size, padded_document_length).
        A pair of tokens is kept only if both tokens are unmasked.
        If None, every token is kept.

    Raises
    ------
    ConfigurationError
        If ``predictions`` and ``gold_labels`` differ in size or are not 4-dimensional.
    """
    if mask is None:
        # The mask lives at the token level: (batch_size, padded_document_length).
        # The former ``ones_like(gold_labels)`` default was 4-dimensional and could
        # not be reshaped into a per-sentence mask below.
        mask = gold_labels.new_ones(gold_labels.size()[:2])
    # Get the data from the Variables.
    predictions, gold_labels, mask = self.unwrap_to_tensors(predictions,
                                                            gold_labels,
                                                            mask)

    if gold_labels.size() != predictions.size():
        raise ConfigurationError("Predictions and gold labels don't have the same size.")
    if predictions.dim() != 4:
        raise ConfigurationError("Predictions must have 4 dimensions: "
                                 "(batch_size, document_length, document_length, n_classes).")

    # Turn the token-level mask into a pair-level mask with an outer product:
    # (batch, length) -> (batch, length, length, 1), broadcast over the classes.
    token_mask = mask.float().cpu()
    pair_mask = (token_mask.unsqueeze(2) * token_mask.unsqueeze(1)).unsqueeze(-1)

    # 0/1 indicators of the relations that remain after masking.
    predicted = ((predictions.float().cpu() * pair_mask) != 0).long()
    expected = ((gold_labels.float().cpu() * pair_mask) != 0).long()

    # Positions are compared element-wise, so the whole batch is counted at once
    # instead of testing the membership of each index in Python.
    true_positives = (predicted * expected).sum().item()
    self._true_positives += true_positives
    self._false_positives += predicted.sum().item() - true_positives
    self._false_negatives += expected.sum().item() - true_positives
+ precision, recall, f1_measure = self._compute_metrics(self._true_positives, + self._false_positives, + self._false_negatives) + all_metrics["precision-overall"] = precision + all_metrics["recall-overall"] = recall + all_metrics["f1-measure-overall"] = f1_measure + if reset: + self.reset() + return all_metrics + + @staticmethod + def _compute_metrics(true_positives: int, false_positives: int, false_negatives: int): + precision = float(true_positives) / float(true_positives + false_positives + 1e-13) + recall = float(true_positives) / float(true_positives + false_negatives + 1e-13) + f1_measure = 2. * ((precision * recall) / (precision + recall + 1e-13)) + return precision, recall, f1_measure + + def reset(self): + self._true_positives = 0 + self._false_positives = 0 + self._false_negatives = 0 diff --git a/hmtl/training/multi_task_trainer.py b/hmtl/training/multi_task_trainer.py new file mode 100644 index 0000000..f9b345e --- /dev/null +++ b/hmtl/training/multi_task_trainer.py @@ -0,0 +1,380 @@ +# coding: utf-8 + +import os +import math +import time +from copy import deepcopy +import random +import logging +import itertools +import shutil +from tensorboardX import SummaryWriter + +from typing import List, Optional, Dict, Any, Tuple + +import torch +import torch.optim.lr_scheduler +import tqdm + +from allennlp.common import Params +from allennlp.common.checks import ConfigurationError, check_for_gpu +from allennlp.common.util import peak_memory_mb, gpu_memory_mb +from allennlp.nn.util import device_mapping, move_to_device +from allennlp.training.learning_rate_schedulers import LearningRateScheduler +from allennlp.training.optimizers import Optimizer +from allennlp.training.trainer import sparse_clip_norm, TensorboardWriter +from allennlp.models.model import Model +from allennlp.common.registrable import Registrable + + +from hmtl.tasks import Task + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + +class MultiTaskTrainer(Registrable): + def 
__init__(self, + model: Model, + task_list: List[Task], + optimizer_params: Params, + lr_scheduler_params: Params, + patience: Optional[int] = None, + num_epochs: int = 20, + serialization_dir: str = None, + cuda_device: int = -1, + grad_norm: Optional[float] = None, + grad_clipping: Optional[float] = None, + min_lr: float = 0.00001, + no_tqdm: bool = False, + summary_interval: int = 50, + log_parameter_statistics: bool = False, + log_gradient_statistics: bool = False): + """ + Parameters + ---------- + model: ``Model``, required. + An AllenNLP model to be optimized. Pytorch Modules can also be optimized if + their ``forward`` method returns a dictionary with a "loss" key, containing a + scalar tensor representing the loss function to be optimized. + iterator: ``DataIterator``, required. + A method for iterating over a ``Dataset``, yielding padded indexed batches. + patience: Optional[int] > 0, optional (default=None) + Number of epochs to be patient before early stopping: the training is stopped + after ``patience`` epochs with no improvement. If given, it must be ``> 0``. + If None, early stopping is disabled. + num_epochs: int, optional (default = 20) + Number of training epochs. + serialization_dir: str, optional (default=None) + Path to directory for saving and loading model files. Models will not be saved if + this parameter is not passed. + cuda_device: int, optional (default = -1) + An integer specifying the CUDA device to use. If -1, the CPU is used. + Multi-gpu training is not currently supported, but will be once the + Pytorch DataParallel API stabilises. + grad_norm: float, optional, (default = None). + If provided, gradient norms will be rescaled to have a maximum of this value. + grad_clipping : float, optional (default = None). + If provided, gradients will be clipped `during the backward pass` to have an (absolute) + maximum of this value. 
If you are getting ``NaNs`` in your gradients during training + that are not solved by using ``grad_norm``, you may need this. + no_tqdm : bool, optional (default=False) + We use ``tqdm`` for logging, which will print a nice progress bar that updates in place + after every batch. This is nice if you're running training on a local shell, but can + cause problems with log files from, e.g., a docker image running on kubernetes. If + ``no_tqdm`` is ``True``, we will not use tqdm, and instead log batch statistics using + ``logger.info``. + """ + self._model = model + parameters_to_train = [(n, p) for n, p in self._model.named_parameters() if p.requires_grad] + + self._task_list = task_list + self._n_tasks = len(self._task_list) + + self._optimizer_params = optimizer_params + self._optimizers = {} + self._lr_scheduler_params = lr_scheduler_params + self._schedulers = {} + for task in self._task_list: + task_name = task._name + self._optimizers[task_name] = Optimizer.from_params(model_parameters = parameters_to_train, + params = deepcopy(optimizer_params)) + self._schedulers[task_name] = LearningRateScheduler.from_params(optimizer = self._optimizers[task_name], + params = deepcopy(lr_scheduler_params)) + + self._serialization_dir = serialization_dir + + self._patience = patience + self._num_epochs = num_epochs + self._cuda_device = cuda_device + if self._cuda_device >= 0: + check_for_gpu(self._cuda_device) + self._model = self._model.cuda(self._cuda_device) + self._grad_norm = grad_norm + self._grad_clipping = grad_clipping + self._min_lr = min_lr + + self._task_infos = None + self._metric_infos = None + + self._tr_generators = None + self._no_tqdm = no_tqdm + + self._summary_interval = summary_interval # num batches between logging to tensorboard + self._log_parameter_statistics = log_parameter_statistics + self._log_gradient_statistics = log_gradient_statistics + self._global_step = 0 + train_log = SummaryWriter(os.path.join(self._serialization_dir, "log", "train")) + 
validation_log = SummaryWriter(os.path.join(self._serialization_dir, "log", "validation")) + self._tensorboard = TensorboardWriter(train_log = train_log, validation_log = validation_log) + + + def train(self, + #tasks: List[Task], + #params: Params, + recover: bool = False): + + raise NotImplementedError + + + def _check_history(self, + metric_history: List[float], + cur_score: float, + should_decrease: bool = False): + ''' + Given a task, the history of the performance on that task, + and the current score, check if current score is + best so far and if out of patience. + + Parameters + ---------- + metric_history: List[float], required + cur_score: float, required + should_decrease: bool, default = False + Wheter or not the validation metric should increase while training. + For instance, the bigger the f1 score is, the better it is -> should_decrease = False + + Returns + ------- + best_so_far: bool + Whether or not the current epoch is the best so far in terms of the speicified validation metric. + out_of_patience: bool + Whether or not the training for this specific task should stop (patience parameter). 
+ ''' + patience = self._patience + 1 + best_fn = min if should_decrease else max + best_score = best_fn(metric_history) + if best_score == cur_score: + best_so_far = metric_history.index(best_score) == len(metric_history) - 1 + else: + best_so_far = False + + out_of_patience = False + if len(metric_history) > patience: + if should_decrease: + out_of_patience = max(metric_history[-patience:]) <= cur_score + else: + out_of_patience = min(metric_history[-patience:]) >= cur_score + + if best_so_far and out_of_patience: # then something is up + print("Something is up") + + return best_so_far, out_of_patience + + + def _forward(self, + tensor_batch: torch.Tensor, + for_training: bool = False, + task:Task = None): + if task is not None: + tensor_batch = move_to_device(tensor_batch, self._cuda_device) + output_dict = self._model.forward(task_name = task._name, tensor_batch = tensor_batch, for_training = for_training) + if for_training: + try: + loss = output_dict["loss"] + loss += self._model.get_regularization_penalty() + except KeyError: + raise RuntimeError("The model you are trying to optimize does not contain a" + " `loss` key in the output of model.forward(inputs).") + return output_dict + else: + raise ConfigurationError("Cannot call forward through task `None`") + + + def _get_metrics(self, + task: Task, + reset: bool = False): + task_tagger = getattr(self._model, "_tagger_" + task._name) + return task_tagger.get_metrics(reset) + + + def _description_from_metrics(self, + metrics: Dict[str, float]): + # pylint: disable=no-self-use + return ', '.join(["%s: %.4f" % (name, value) for name, value in metrics.items()]) + " ||" + + + def _rescale_gradients(self) -> Optional[float]: + """ + Performs gradient rescaling. Is a no-op if gradient rescaling is not enabled. 
+ """ + if self._grad_norm: + parameters_to_clip = [p for p in self._model.parameters() + if p.grad is not None] + return sparse_clip_norm(parameters_to_clip, self._grad_norm) + return None + + + def _enable_gradient_clipping(self) -> None: + if self._grad_clipping is not None: + # Pylint is unable to tell that we're in the case that _grad_clipping is not None... + # pylint: disable=invalid-unary-operand-type + clip_function = lambda grad: grad.clamp(-self._grad_clipping, self._grad_clipping) + for parameter in self._model.parameters(): + if parameter.requires_grad: + parameter.register_hook(clip_function) + + + def _save_checkpoint(self, + epoch: int, + should_stop: bool) -> None: + """ + Save the current states (model, training, optimizers, metrics and tasks). + + Parameters + ---------- + epoch: int, required. + The epoch of training. + should_stop: bool, required + Wheter or not the training is finished. + should_save_model: bool, optional (default = True) + Whether or not the model state should be saved. 
+ """ + ### Saving training state ### + training_state = {"epoch": epoch, + "should_stop": should_stop, + "metric_infos": self._metric_infos, + "task_infos": self._task_infos, + "schedulers": {}, + "optimizers": {}} + + if self._optimizers is not None: + for task_name, optimizer in self._optimizers.items(): + training_state["optimizers"][task_name] = optimizer.state_dict() + if self._schedulers is not None: + for task_name, scheduler in self._schedulers.items(): + training_state["schedulers"][task_name] = scheduler.lr_scheduler.state_dict() + + training_path = os.path.join(self._serialization_dir, "training_state.th") + torch.save(training_state, training_path) + logger.info("Checkpoint - Saved training state to %s", training_path) + + + ### Saving model state ### + model_path = os.path.join(self._serialization_dir, "model_state.th") + model_state = self._model.state_dict() + torch.save(model_state, model_path) + logger.info("Checkpoint - Saved model state to %s", model_path) + + + ### Saving best models for each task ### + for task_name, infos in self._metric_infos.items(): + best_epoch, _ = infos["best"] + if best_epoch == epoch: + logger.info("Checkpoint - Best validation performance so far for %s task", task_name) + logger.info("Checkpoint - Copying weights to '%s/best_%s.th'.", self._serialization_dir, task_name) + shutil.copyfile(model_path, os.path.join(self._serialization_dir, "best_{}.th".format(task_name))) + + + def find_latest_checkpoint(self) -> Tuple[str, str]: + """ + Return the location of the latest model and training state files. + If there isn't a valid checkpoint then return None. 
+ """ + have_checkpoint = (self._serialization_dir is not None and + any("model_state" in x for x in os.listdir(self._serialization_dir)) and + any("training_state" in x for x in os.listdir(self._serialization_dir))) + + if not have_checkpoint: + return None + + model_path = os.path.join(self._serialization_dir, + "model_state.th") + training_state_path = os.path.join(self._serialization_dir, + "training_state.th") + + return (model_path, training_state_path) + + + def _restore_checkpoint(self): + """ + Restores a model from a serialization_dir to the last saved checkpoint. + This includes an epoch count, optimizer state, a model state, a task state and + a metric state. All are of which are serialized separately. + This function should only be used to continue training - + if you wish to load a model for inference/load parts of a model into a new + computation graph, you should use the native Pytorch functions: + `` model.load_state_dict(torch.load("/path/to/model/weights.th"))`` + + Returns + ------- + epoch: int, + The epoch at which to resume training. + should_stop: bool + Whether or not the training should already by stopped. + """ + + latest_checkpoint = self.find_latest_checkpoint() + + if not self._serialization_dir: + raise ConfigurationError("`serialization_dir` not specified - cannot " + "restore a model without a directory path.") + if latest_checkpoint is None: + raise ConfigurationError("Cannot restore model because one of" + "`model_state.th` or `training_state.th` is not in directory path.") + + model_path, training_state_path = latest_checkpoint + + # Load the parameters onto CPU, then transfer to GPU. + # This avoids potential OOM on GPU for large models that + # load parameters onto GPU then make a new GPU copy into the parameter + # buffer. The GPU transfer happens implicitly in load_state_dict. 
@classmethod
def from_params(cls,
                model: Model,
                task_list: List[Task],
                serialization_dir: str,
                params: Params) -> 'MultiTaskTrainer':
    """
    Construct the multi task trainer described by ``params``.

    The ``type`` key of ``params`` selects one of the classes returned by
    ``cls.list_available()``; the construction is then delegated to the
    ``from_params`` method of that class with the same arguments.
    """
    registered_names = cls.list_available()
    trainer_type = params.pop_choice('type', registered_names)
    trainer_class = cls.by_name(trainer_type)

    return trainer_class.from_params(model = model,
                                     task_list = task_list,
                                     serialization_dir = serialization_dir,
                                     params = params)
50, + log_parameter_statistics: bool = False, + log_gradient_statistics: bool = False, + sampling_method: str = "proportional"): + + if sampling_method not in ["uniform", "proportional"]: + raise ConfigurationError(f"Sampling method ({sampling_method}) must be `uniform` or `proportional`.") + + self._sampling_method = sampling_method + super(SamplerMultiTaskTrainer, self).__init__(model = model, + task_list = task_list, + optimizer_params = optimizer_params, + lr_scheduler_params = lr_scheduler_params, + patience = patience, + num_epochs = num_epochs, + serialization_dir=serialization_dir, + cuda_device = cuda_device, + grad_norm = grad_norm, + grad_clipping = grad_clipping, + min_lr = min_lr, + no_tqdm = no_tqdm, + summary_interval = summary_interval, + log_parameter_statistics = log_parameter_statistics, + log_gradient_statistics = log_gradient_statistics) + + + + @overrides + def train(self, + recover: bool = False): + ''' + Train the different task_list, save the different checkpoints and metrics, + and save the model at the end of training while logging the training details. + + The metrics through the training are stored in dictionaries with the following structure: + + all_metrics - Dict[str, str] + task_name: val_metric + + metric_infos (Dict[]) + task_name (Dict[str, diverse] + val_metric (str): name (str) + hist (str): history_of_the_val_metric (List[float]) + stopped (str): training_is_stopped (bool) + best (str): best_epoch_for_val_metric (Tuple(int, Dict)) + + all_tr_metrics (Dict[str, Dict[str, float]]) + task_name (Dict[str, float]) + metric_name (str): value (float) + loss: value (float) + + all_val_metrics (Dict[str, Dict[str, float]]) + task_name (Dict[str, float]) + metric_name (str): value (float) + loss (str): value (float) + + Parameters + ---------- + task_list: List[Task], required + A list containing the tasks to train. 
+ params: Params, required + Training parameters + recover: bool, required + Whether or not training should be recovered from a previous training. + + Returns + ------- + return_dict: Dict + A dictionary summarizing the training and the metrics for the best epochs for each task. + ''' + training_start_time = time.time() + + if recover: + try: + n_epoch, should_stop = self._restore_checkpoint() + logger.info("Loaded model from checkpoint. Starting at epoch %d", n_epoch) + except RuntimeError: + raise ConfigurationError("Could not recover training from the checkpoint. Did you mean to output to " + "a different serialization directory or delete the existing serialization " + "directory?") + else: + n_epoch, should_stop = 0, False + + ### Store all the necessary informations and attributes about the tasks ### + task_infos = {task._name: {} for task in self._task_list} + for task_idx, task in enumerate(self._task_list): + task_info = task_infos[task._name] + + # Store statistiscs on training and validation batches + data_iterator = task._data_iterator + n_tr_batches = data_iterator.get_num_batches(task._train_data) + n_val_batches = data_iterator.get_num_batches(task._validation_data) + task_info['n_tr_batches'] = n_tr_batches + task_info['n_val_batches'] = n_val_batches + + # Create counter for number of batches trained during the whole + # training for this specific task + task_info['total_n_batches_trained'] = 0 + + task_info['last_log'] = time.time() # Time of last logging + self._task_infos = task_infos + + ### Bookkeeping the validation metrics ### + metric_infos = {task._name: {'val_metric': task._val_metric, + 'hist': [], + 'is_out_of_patience': False, + 'min_lr_hit': False, + 'best': (-1, {})} + for task in self._task_list} + self._metric_infos = metric_infos + + + ### Write log ### + total_n_tr_batches = 0 # The total number of training batches across all the datasets. 
+ for task_name, info in self._task_infos.items(): + total_n_tr_batches += info["n_tr_batches"] + logger.info("Task %s:", task_name) + logger.info("\t%d training batches", info["n_tr_batches"]) + logger.info("\t%d validation batches", info["n_val_batches"]) + + + ### Create the training generators/iterators tqdm ### + self._tr_generators = {} + for task in self._task_list: + data_iterator = task._data_iterator + tr_generator = data_iterator(task._train_data, + num_epochs = None) + self._tr_generators[task._name] = tr_generator + + + ### Create sampling probability distribution ### + if self._sampling_method == "uniform": + sampling_prob = [float(1/self._n_tasks)]*self._n_tasks + elif self._sampling_method == "proportional": + sampling_prob = [float(info['n_tr_batches']/total_n_tr_batches) for info in self._task_infos.values()] + + + ### Enable gradient clipping ### + # Only if self._grad_clipping is specified + self._enable_gradient_clipping() + + + ### Setup is ready. Training of the model can begin ### + logger.info("Set up ready. Beginning training/validation.") + + + ### Begin Training of the model ### + while not should_stop: + # Train one epoch (training pass + validation pass) + + + self._model.train() # Set the model to "train" mode. 
+ + + ### Log Infos: current epoch count and CPU/GPU usage ### + logger.info("") + logger.info("Epoch %d/%d - Begin", n_epoch, self._num_epochs - 1) + logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}") + for gpu, memory in gpu_memory_mb().items(): + logger.info(f"GPU {gpu} memory usage MB: {memory}") + + logger.info("Training - Begin") + + + ### Reset training and trained batches counter before new training epoch ### + for _, task_info in self._task_infos.items(): + task_info["tr_loss_cum"] = 0.0 + task_info["n_batches_trained_this_epoch"] = 0 + all_tr_metrics = {} # BUG TO COMPLETE COMMENT TO MAKE IT MORE CLEAR + + + ### Start training epoch ### + epoch_tqdm = tqdm.tqdm(range(total_n_tr_batches), total = total_n_tr_batches) + for _ in epoch_tqdm: + task_idx = np.argmax(np.random.multinomial(1, sampling_prob)) + task = self._task_list[task_idx] + task_info = self._task_infos[task._name] + + + ### One forward + backward pass ### + + # Call next batch to train + batch = next(self._tr_generators[task._name]) + task_info["n_batches_trained_this_epoch"] += 1 + + # Load optimizer + optimizer = self._optimizers[task._name] + optimizer.zero_grad() + + # Get the loss for this batch + output_dict = self._forward(tensor_batch = batch, task = task, for_training = True) + assert "loss" in output_dict, "Model must return a dict containing a 'loss' key" + loss = output_dict["loss"] + loss.backward() + task_info["tr_loss_cum"] += loss.item() + + # Gradient rescaling if self._grad_norm is specified + self._rescale_gradients() + + # Take an optimization step + optimizer.step() + + + ### Get metrics for all progress so far, update tqdm, display description ### + task_metrics = self._get_metrics(task = task) + task_metrics["loss"] = float(task_info["tr_loss_cum"] / (task_info["n_batches_trained_this_epoch"]+0.000001)) + description = self._description_from_metrics(task_metrics) + epoch_tqdm.set_description(task._name + ", " + description) + + + ### Tensorboard logging: 
Training detailled metrics, parameters and gradients ### + if self._global_step % self._summary_interval == 0: + # Metrics + for metric_name, value in task_metrics.items(): + self._tensorboard.add_train_scalar(name = "training_details/" + task._name + "/" + metric_name, + value = value, + global_step = self._global_step) + # Parameters and Gradients + for param_name, param in self._model.named_parameters(): + if self._log_parameter_statistics: + self._tensorboard.add_train_scalar(name = "parameter_mean/" + param_name, + value = param.data.mean(), + global_step = self._global_step) + self._tensorboard.add_train_scalar(name = "parameter_std/" + param_name, + value = param.data.std(), + global_step = self._global_step) + if param.grad is None: + continue + if self._log_gradient_statistics: + self._tensorboard.add_train_scalar(name = "grad_mean/" + param_name, + value = param.grad.data.mean(), + global_step = self._global_step) + self._tensorboard.add_train_scalar(name = "grad_std/" + param_name, + value = param.grad.data.std(), + global_step = self._global_step) + self._global_step += 1 + + + + ### Bookkeeping all the training metrics for all the tasks on the training epoch that just finished ### + for task in self._task_list: + task_info = self._task_infos[task._name] + + task_info['total_n_batches_trained'] += task_info["n_batches_trained_this_epoch"] + task_info['last_log'] = time.time() + + task_metrics = self._get_metrics(task = task, reset = True) + if task._name not in all_tr_metrics: + all_tr_metrics[task._name ] = {} + for name, value in task_metrics.items(): + all_tr_metrics[task._name][name] = value + all_tr_metrics[task._name]["loss"] = \ + float(task_info["tr_loss_cum"] / (task_info["n_batches_trained_this_epoch"]+0.00000001)) + + # Tensorboard - Training metrics for this epoch + self._tensorboard.add_train_scalar(name = "training_proportions/" + task._name, + value = task_info['n_batches_trained_this_epoch'], + global_step = n_epoch) + for metric_name, 
value in all_tr_metrics[task._name].items(): + self._tensorboard.add_train_scalar(name = "task_" + task._name + "/" + metric_name, + value = value, + global_step = n_epoch) + + + logger.info("Train - End") + + + + + ### Begin validation of the model ### + logger.info("Validation - Begin") + all_val_metrics = {} + + + self._model.eval() #Set the model into evaluation mode + + + for task_idx, task in enumerate(self._task_list): + logger.info("Validation - Task %d/%d: %s", task_idx + 1, self._n_tasks, task._name) + + val_loss = 0.0 + n_batches_val_this_epoch_this_task = 0 + n_val_batches = self._task_infos[task._name]['n_val_batches'] + scheduler = self._schedulers[task._name] + + # Create tqdm generator for current task's validation + data_iterator = task._data_iterator + val_generator = data_iterator(task._validation_data, + num_epochs = 1, + shuffle = False) + val_generator_tqdm = tqdm.tqdm(val_generator, + total = n_val_batches) + + # Iterate over each validation batch for this task + for batch in val_generator_tqdm: + n_batches_val_this_epoch_this_task += 1 + + # Get the loss + val_output_dict = self._forward(batch, task = task, for_training = False) + loss = val_output_dict["loss"] + val_loss += loss.item() + + # Get metrics for all progress so far, update tqdm, display description + task_metrics = self._get_metrics(task = task) + task_metrics["loss"] = float(val_loss / n_batches_val_this_epoch_this_task) + description = self._description_from_metrics(task_metrics) + val_generator_tqdm.set_description(description) + + # Get task validation metrics and store them in all_val_metrics + task_metrics = self._get_metrics(task = task, reset = True) + if task._name not in all_val_metrics: + all_val_metrics[task._name] = {} + for name, value in task_metrics.items(): + all_val_metrics[task._name][name] = value + all_val_metrics[task._name]["loss"] = float(val_loss / n_batches_val_this_epoch_this_task) + + # Tensorboard - Validation metrics for this epoch + for 
metric_name, value in all_val_metrics[task._name].items(): + self._tensorboard.add_validation_scalar(name = "task_" + task._name + "/" + metric_name, + value = value, + global_step = n_epoch) + + + ### Perform a patience check and update the history of validation metric for this task ### + this_epoch_val_metric = all_val_metrics[task._name][task._val_metric] + metric_history = self._metric_infos[task._name]['hist'] + + metric_history.append(this_epoch_val_metric) + is_best_so_far, out_of_patience = self._check_history(metric_history = metric_history, + cur_score = this_epoch_val_metric, + should_decrease = task._val_metric_decreases) + + if is_best_so_far: + logger.info("Best model found for %s.", task._name) + self._metric_infos[task._name]['best'] = (n_epoch, all_val_metrics) + if out_of_patience and not self._metric_infos[task._name]['is_out_of_patience']: + self._metric_infos[task._name]['is_out_of_patience'] = True + logger.info("Task %s is out of patience and vote to stop the training.", task._name) + + # The LRScheduler API is agnostic to whether your schedule requires a validation metric - + # if it doesn't, the validation metric passed here is ignored. 
+ scheduler.step(this_epoch_val_metric, n_epoch) + + + logger.info("Validation - End") + + + ### Print all training and validation metrics for this epoch ### + logger.info("***** Epoch %d/%d Statistics *****", n_epoch, self._num_epochs - 1) + for task in self._task_list: + logger.info("Statistic: %s", task._name) + logger.info("\tTraining - %s: %3d", "Nb batches trained", self._task_infos[task._name]["n_batches_trained_this_epoch"]) + for metric_name, value in all_tr_metrics[task._name].items(): + logger.info("\tTraining - %s: %3f", metric_name, value) + for metric_name, value in all_val_metrics[task._name].items(): + logger.info("\tValidation - %s: %3f", metric_name, value) + logger.info("**********") + + + ### Check to see if should stop ### + stop_tr, stop_val = True, True + + for task in self._task_list: + #task_info = self._task_infos[task._name] + if self._optimizers[task._name].param_groups[0]['lr'] < self._min_lr: + logger.info("Minimum lr hit on %s.", task._name) + logger.info("Task %s vote to stop training.", task._name) + metric_infos[task._name]['min_lr_hit'] = True + stop_tr = stop_tr and self._metric_infos[task._name]['min_lr_hit'] + stop_val = stop_val and self._metric_infos[task._name]['is_out_of_patience'] + + if stop_tr: + should_stop = True + logging.info("All tasks hit minimum lr. Stopping training.") + if stop_val: + should_stop = True + logging.info("All metrics ran out of patience. Stopping training.") + if n_epoch >= self._num_epochs - 1: + should_stop = True + logging.info("Maximum number of epoch hit. Stopping training.") + + self._save_checkpoint(n_epoch, should_stop) + + + ### Update n_epoch ### + # One epoch = doing N (forward + backward) pass where N is the total number of training batches. 
+ n_epoch += 1 + + + ### Summarize training at the end ### + logging.info('***** Training is finished *****') + logging.info('Stopped training after %d epochs', n_epoch) + return_metrics = {} + for task_name, task_info in self._task_infos.items(): + nb_epoch_trained = int(task_info['total_n_batches_trained'] / task_info['n_tr_batches']) + logging.info('Trained %s for %d batches ~= %d epochs', + task_name, + task_info['total_n_batches_trained'], + nb_epoch_trained) + return_metrics[task._name] = {"best_epoch": self._metric_infos[task_name]['best'][0], + "nb_epoch_trained": nb_epoch_trained, + "best_epoch_val_metrics": self._metric_infos[task_name]['best'][1]} + + training_elapsed_time = time.time() - training_start_time + return_metrics["training_duration"] = time.strftime("%d:%H:%M:%S", time.gmtime(training_elapsed_time)) + return_metrics["nb_epoch_trained"] = n_epoch + + + return return_metrics + + @classmethod + def from_params(cls, + model: Model, + task_list: List[Task], + serialization_dir: str, + params: Params) -> 'SamplerMultiTaskTrainer': + ''' Generator multi-task trainer from parameters. 
''' + + optimizer_params = params.pop("optimizer") + lr_scheduler_params = params.pop("scheduler") + patience = params.pop_int("patience", 2) + num_epochs = params.pop_int("num_epochs", 20) + cuda_device = params.pop_int("cuda_device", -1) + grad_norm = params.pop_float("grad_norm", None) + grad_clipping = params.pop_float("grad_clipping", None) + min_lr = params.pop_float("min_lr", 0.00001) + no_tqdm = params.pop_bool("no_tqdm", False) + summary_interval = params.pop("sumarry_interval", 50) + log_parameter_statistics = params.pop("log_parameter_statistics", False) + log_gradient_statistics = params.pop("log_gradient_statistics", False) + sampling_method = params.pop("sampling_method", "proportional") + + params.assert_empty(cls.__name__) + return SamplerMultiTaskTrainer(model = model, + task_list = task_list, + optimizer_params = optimizer_params, + lr_scheduler_params = lr_scheduler_params, + patience = patience, + num_epochs = num_epochs, + serialization_dir = serialization_dir, + cuda_device = cuda_device, + grad_norm = grad_norm, + grad_clipping = grad_clipping, + min_lr = min_lr, + no_tqdm = no_tqdm, + summary_interval = summary_interval, + log_parameter_statistics = log_parameter_statistics, + log_gradient_statistics = log_gradient_statistics, + sampling_method = sampling_method) \ No newline at end of file diff --git a/html_senteval.py b/html_senteval.py new file mode 100644 index 0000000..3d52f2f --- /dev/null +++ b/html_senteval.py @@ -0,0 +1,166 @@ +# coding: utf-8 + +""" +A quick and simple script for evaluating the embeddings throught the HTML model/hierarchy +using SentEval. 
def text_to_instance(sent, token_indexers):
    """
    Wrap one tokenized sentence into an ``Instance`` holding a single ``text`` field.

    Parameters
    ----------
    sent:
        An iterable of words; each word is wrapped into a ``Token``.
    token_indexers:
        The token indexers handed to the ``TextField``.
    """
    tokens = [Token(word) for word in sent]
    text = TextField(tokens, token_indexers = token_indexers)
    return Instance({"text": text})


def sentences_to_indexed_batch(sentences, token_indexers, vocabulary = None):
    """
    Build a ``Batch`` from tokenized sentences and index it with a vocabulary.

    Parameters
    ----------
    sentences:
        An iterable of tokenized sentences (see ``text_to_instance``).
    token_indexers:
        The token indexers handed to every ``TextField``.
    vocabulary: optional (default = None)
        The vocabulary used to index the instances. When omitted, the module-level
        ``vocab`` created by the script entry point is used, which is the historical
        behaviour of this function.

    Returns
    -------
    batch: ``Batch``
        The indexed batch.
    """
    if vocabulary is None:
        # Historical behaviour: rely on the global created under ``__main__``.
        vocabulary = vocab
    instances = [text_to_instance(sent, token_indexers) for sent in sentences]
    batch = Batch(instances)
    batch.index_instances(vocabulary)
    return batch
def compute_embds_from_layer(model, model_layer_name, batch):
    """
    Compute one fixed-size embedding per sentence from a given layer of the model.

    The selected layer is applied to the ``text`` field of ``batch`` and its outputs are
    averaged over the non-padded tokens (the mask comes from ``util.get_text_field_mask``).

    Parameters
    ----------
    model:
        The model exposing the embedders/encoders listed in ``layer_modules`` below.
    model_layer_name: ``str``
        One of ``text_field_embedder``, ``encoder_ner``, ``encoder_emd``,
        ``encoder_relation`` or ``encoder_coref``.
    batch:
        An indexed batch providing ``as_tensor_dict`` and ``get_padding_lengths``.

    Returns
    -------
    A numpy array holding one averaged embedding per sentence of the batch.

    Raises
    ------
    ValueError
        If ``model_layer_name`` is not one of the supported layers.
    """
    # layer name -> (embedder attribute, encoder attribute applied on top of it or None)
    layer_modules = {
        "text_field_embedder": ("_text_field_embedder", None),
        "encoder_ner": ("_text_field_embedder", "_encoder_ner"),
        "encoder_emd": ("_shortcut_text_field_embedder", "_encoder_emd"),
        "encoder_relation": ("_shortcut_text_field_embedder_relation", "_encoder_relation"),
        "encoder_coref": ("_shortcut_text_field_embedder_coref", "_encoder_coref"),
    }
    # Validate first: an unknown name used to surface later as an UnboundLocalError.
    if model_layer_name not in layer_modules:
        raise ValueError("Unknown layer name `{}`. Expected one of: {}.".format(
            model_layer_name, ", ".join(sorted(layer_modules))))
    embedder_name, encoder_name = layer_modules[model_layer_name]

    batch_tensor = batch.as_tensor_dict(batch.get_padding_lengths())
    text = batch_tensor["text"]
    text_mask = util.get_text_field_mask(text)

    # Pure feature extraction: no need to build the autograd graph.
    with torch.no_grad():
        embds = getattr(model, embedder_name)(text)
        if encoder_name is not None:
            embds = getattr(model, encoder_name)(embds, text_mask)

    # Masked mean over the token axis; broadcasting replaces the explicit mask tiling.
    mask = text_mask.unsqueeze(-1).float()
    embds_sum = (embds * mask).sum(dim = 1)
    # ``clamp`` keeps an all-padding row from dividing by zero.
    n_tokens = mask.sum(dim = 1).clamp(min = 1)
    computed_embds = embds_sum / n_tokens

    # ``.cpu()`` is a no-op on CPU tensors and is required before ``.numpy()`` otherwise.
    return computed_embds.detach().cpu().numpy()


# SentEval prepare and batcher
def prepare(params, samples):
    """SentEval ``prepare`` hook: nothing to prepare for this model."""
    return


def batcher(params, batch):
    """
    SentEval ``batcher`` hook: embed a batch of tokenized sentences.

    Relies on the module-level ``token_index``, ``model`` and ``args`` set up by the
    script entry point.
    """
    indexed_batch = sentences_to_indexed_batch(batch, token_index)
    return compute_embds_from_layer(model, args.layer_name, indexed_batch)
if __name__ == "__main__":
    # Evaluate the sentence embeddings of one layer of a trained model with SentEval.
    parser = argparse.ArgumentParser()
    parser.add_argument("-s",
                        "--serialization_dir",
                        required = True,
                        help = "Directory from which to load the pretrained model.",
                        type = str)
    parser.add_argument("-t",
                        "--task",
                        required = False,
                        default = "ner",
                        help = "Name of the task to load.",
                        type = str)
    parser.add_argument("-l",
                        "--layer_name",
                        required = False,
                        default = "text_field_embedder",
                        help = "Name of encoder/embedding layer of the model",
                        type = str)
    args = parser.parse_args()

    serialization_dir = args.serialization_dir
    config_path = os.path.join(serialization_dir, "config.json")
    vocabulary_path = os.path.join(serialization_dir, "vocabulary")

    params = Params.from_file(params_file = config_path)
    logger.info("Parameters loaded from %s", config_path)

    ### Load Vocabulary from files ###
    logger.info("Loading Vocavulary from %s", vocabulary_path)
    vocab = Vocabulary.from_files(vocabulary_path)
    logger.info("Vocabulary loaded")

    ### Create model ###
    model_params = params.pop("model")
    model = Model.from_params(vocab = vocab, params = model_params, regularizer = None)
    best_model_state_path = os.path.join(serialization_dir, "best_{}.th".format(args.task))
    # The embeddings are converted to numpy arrays, so the weights are loaded on CPU
    # whatever device they were saved from.
    best_model_state = torch.load(best_model_state_path, map_location = "cpu")
    model.load_state_dict(state_dict = best_model_state)
    # Evaluation mode: layers such as dropout must be inactive for the embeddings to be deterministic.
    model.eval()

    ### Create token indexer ###
    task_keys = [key for key in params.keys() if re.search("^task_", key)]
    if not task_keys:
        raise ValueError("No `task_*` section found in {}: cannot build the token indexers.".format(config_path))
    # The token indexers are read from the last task section of the configuration.
    token_indexer_params = params.pop(task_keys[-1]).pop("data_params").pop("dataset_reader").pop("token_indexers")
    token_index = {}
    for name, indexer_params in token_indexer_params.items():
        token_index[name] = TokenIndexer.from_params(indexer_params)

    params_senteval['encoder'] = model

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = ['Length', 'WordContent', 'Depth', 'TopConstituents',
                      'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
                      'OddManOut', 'CoordinationInversion']
    results = se.eval(transfer_tasks)

    print(results)
    logger.info("SentEval(uation) Finished")
#!/bin/bash
# Download the pre-trained ELMo models (original, medium and small sizes) into
# ./data/elmo and the GloVe 6B 100d vectors into ./data/glove.
# Paths are relative to the directory the script is launched from.

# Stop at the first failure instead of silently continuing in the wrong directory.
set -euo pipefail

ELMO_BASE_URL="https://s3-us-west-2.amazonaws.com/allennlp/models/elmo"
GLOVE_URL="https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz"

# `-p` creates ./data when it is missing and makes the script safe to re-run.
mkdir -p data/elmo data/glove

#ELMO (original, medium and small sizes)
for model in 2x4096_512_2048cnn_2xhighway \
             2x2048_256_2048cnn_1xhighway \
             2x1024_128_2048cnn_1xhighway; do
    for suffix in options.json weights.hdf5; do
        # Files are stored locally without the upstream "elmo_" prefix.
        wget -O "data/elmo/${model}_${suffix}" "${ELMO_BASE_URL}/${model}/elmo_${model}_${suffix}"
    done
done

#Glove
wget -O "data/glove/glove.6B.100d.txt.gz" "${GLOVE_URL}"
+mkdir glove +cd glove +wget https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz \ No newline at end of file diff --git a/scripts/machine_setup.sh b/scripts/machine_setup.sh new file mode 100755 index 0000000..d3342e2 --- /dev/null +++ b/scripts/machine_setup.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +### Install git-lfs ### +curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash +sudo apt-get install git-lfs +git lfs install + + +### Install Python3.6 ### +sudo add-apt-repository ppa:deadsnakes/ppa +sudo apt-get update +sudo apt-get install python3.6 python3.6-dev +wget https://bootstrap.pypa.io/get-pip.py +sudo python3.6 get-pip.py +sudo ln -s /usr/bin/python3.6 /usr/local/bin/python3 +sudo ln -s /usr/local/bin/pip /usr/local/bin/pip3 + + +### Create a clean Python3.6 environment ### +sudo pip3 install virtualenv +virtualenv -p /usr/bin/python3.6 .env +source ./.env/bin/activate + + +### Install dependencies ### +pip install -r requirements.txt + + +### Install submodules (SentEval) ### +git submodule init +git submodule update + +sudo apt-get install unzip +cd SentEval/data/downstream/ +./get_transfer_data.bash diff --git a/train.py b/train.py new file mode 100644 index 0000000..f0a54f4 --- /dev/null +++ b/train.py @@ -0,0 +1,237 @@ +# coding: utf-8 + +""" +The ``train.py`` file can be used to train a model. +It requires a configuration file and a directory in +which to write the results. + +.. code-block:: bash + + $ python train.py --help + usage: train.py [-h] -s SERIALIZATION_DIR -c CONFIG_FILE_PATH [-r] + + optional arguments: + -h, --help show this help message and exit + -s SERIALIZATION_DIR, --serialization_dir SERIALIZATION_DIR + Directory in which to save the model and its logs. + -c CONFIG_FILE_PATH, --config_file_path CONFIG_FILE_PATH + Path to parameter file describing the multi-tasked + model to be trained. + -r, --recover Recover a previous training from the state in + serialization_dir. 
def tasks_and_vocab_from_params(params: Params,
                                serialization_dir: str) -> Tuple[List[Task], Vocabulary]:
    '''
    Instantiate every task described in ``params`` (the keys starting with ``task_``),
    load their datasets, then fit and save the vocabulary.

    The vocabulary is fitted on the concatenation of the instances that each task
    returns for vocabulary creation.

    Parameters
    ----------
    params: ``Params``
        A parameter object specifing an experiment.
    serialization_dir: ``str``
        Directory in which to save the model and its logs.

    Returns
    -------
    task_list: ``List[Task]``
        A list containing the tasks of the model to train.
    vocab: ``Vocabulary``
        The vocabulary fitted on the datasets_for_vocab_creation.
    '''
    ### Instantiate the different tasks ###
    tasks = []
    instance_streams = []        # one iterable of instances per task, in task order
    vocab_dataset_names = {}     # task name -> datasets used for the vocabulary

    for section in [key for key in params.keys() if re.search("^task_", key)]:
        logger.info("Creating %s", section)
        section_params = params.pop(section)
        description_params = section_params.pop("task_description")
        data_params = section_params.pop("data_params")

        current_task = Task.from_params(params = description_params)
        tasks.append(current_task)

        instances, dataset_names = current_task.load_data_from_params(params = data_params)
        instance_streams.append(instances)
        vocab_dataset_names[current_task._name] = dataset_names

    ### Create and save the vocabulary ###
    for name, dataset_names in vocab_dataset_names.items():
        logger.info("Creating a vocabulary using %s data from %s.", ", ".join(dataset_names), name)

    logger.info("Fitting vocabulary from dataset")
    # Lazily chain the per-task streams, in the order the tasks were created.
    all_instances = itertools.chain.from_iterable(instance_streams)
    vocabulary = Vocabulary.from_params(params.pop("vocabulary", {}), all_instances)

    vocabulary_dir = os.path.join(serialization_dir, "vocabulary")
    vocabulary.save_to_files(vocabulary_dir)
    logger.info("Vocabulary saved to %s", vocabulary_dir)

    return tasks, vocabulary
def train_model(multi_task_trainer: MultiTaskTrainer,
                recover: bool = False) -> Dict[str, Any]:
    '''
    Launching the training of the multi-task model.

    Parameters
    ----------
    multi_task_trainer: ``MultiTaskTrainer``
        A trainer (similar to allennlp.training.trainer.Trainer) that can handle multi-task training.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    metrics: ``Dict[str, Any]``
        The different metrics summarizing the training of the model.
        It includes the validation and test (if necessary) metrics.
        The same dictionary is also written to ``metrics.json`` in the serialization directory.

    Raises
    ------
    ValueError
        If a task asks to be evaluated on test data but has no test data loaded.
    '''
    ### Train the multi-task model ###
    metrics = multi_task_trainer.train(recover = recover)

    task_list = multi_task_trainer._task_list
    serialization_dir = multi_task_trainer._serialization_dir
    model = multi_task_trainer._model

    ### Evaluate the model on test data if necessary ###
    # This is a multi-task learning framework, the best validation metrics for one task are not necessarily
    # obtained from the same epoch for all the tasks, one epoch begin equal to N forward+backward passes,
    # where N is the total number of batches in all the training sets.
    # We evaluate each of the best model for each task (based on the validation metrics) for all the other tasks (which have a test set).
    tasks_to_evaluate = [task for task in task_list if task._evaluate_on_test]

    # Fail before any checkpoint is loaded; an explicit raise (unlike ``assert``) survives ``python -O``.
    for task in tasks_to_evaluate:
        if task._test_data is None:
            raise ValueError("Task {} wants to be evaluated on test dataset but no there is no test data loaded.".format(task._name))

    for task in tasks_to_evaluate:
        logger.info("Task %s will be evaluated using the best epoch weights.", task._name)

        logger.info("Loading the best epoch weights for task %s", task._name)
        best_model_state_path = os.path.join(serialization_dir, "best_{}.th".format(task._name))
        # Deserialize on CPU; ``load_state_dict`` then copies the weights into the model's
        # own parameters. The trainer's model is overwritten in place.
        best_model_state = torch.load(best_model_state_path, map_location = "cpu")
        model.load_state_dict(state_dict = best_model_state)

        test_metric_dict = {}

        for pair_task in tasks_to_evaluate:
            logger.info("Pair task %s is evaluated with the best model for %s", pair_task._name, task._name)
            test_metrics = evaluate(model = model,
                                    task_name = pair_task._name,
                                    instances = pair_task._test_data,
                                    data_iterator = pair_task._data_iterator,
                                    cuda_device = multi_task_trainer._cuda_device)
            test_metric_dict[pair_task._name] = dict(test_metrics)

        # ``setdefault`` keeps the evaluation from failing if the trainer summary has no
        # entry for this task.
        metrics.setdefault(task._name, {})["test"] = deepcopy(test_metric_dict)
        logger.info("Finished evaluation of task %s.", task._name)

    ### Dump validation and possibly test metrics ###
    metrics_json = json.dumps(metrics, indent = 2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return metrics
if __name__ == "__main__":
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-s",
                        "--serialization_dir",
                        required = True,
                        help = "Directory in which to save the model and its logs.",
                        type = str)
    parser.add_argument("-c",
                        "--config_file_path",
                        required = True,
                        help = "Path to parameter file describing the multi-tasked model to be trained.",
                        type = str)
    parser.add_argument("-r",
                        "--recover",
                        action = "store_true",
                        default = False,
                        help = "Recover a previous training from the state in serialization_dir.")
    args = parser.parse_args()


    params = Params.from_file(params_file = args.config_file_path)
    serialization_dir = args.serialization_dir
    create_serialization_dir(params, serialization_dir, args.recover)

    # Save a copy of the full configuration before the sections are popped below.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "config.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent = 4)


    ### Instantiate the different tasks from the param file, load datasets and create vocabulary ###
    tasks, vocab = tasks_and_vocab_from_params(params = params, serialization_dir = serialization_dir)


    ### Load the data iterators for each task ###
    tasks = create_and_set_iterators(params = params, task_list = tasks, vocab = vocab)


    ### Load Regularizations ###
    regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))


    ### Create model ###
    model_params = params.pop("model")
    model = Model.from_params(vocab = vocab, params = model_params, regularizer = regularizer)


    ### Create multi-task trainer ###
    multi_task_trainer_params = params.pop("multi_task_trainer")
    trainer = MultiTaskTrainer.from_params(model = model,
                                           task_list = tasks,
                                           serialization_dir = serialization_dir,
                                           params = multi_task_trainer_params)


    ### Launch training ###
    metrics = train_model(multi_task_trainer = trainer,
                          recover = args.recover)
    if metrics is not None:
        # Use the module logger, like the rest of this file, rather than the root logger.
        logger.info("Training is finished ! Let's have a drink. It's on the house !")