From 0d042cffeebbec17cc6dd8729ab805e39e0ebc7e Mon Sep 17 00:00:00 2001 From: Jon Gauthier Date: Mon, 25 May 2020 15:58:22 -0400 Subject: [PATCH 1/4] optional GPU support for both tinylstm and ordered-neurons --- models/ordered-neurons/bin/get_surprisals | 2 +- models/ordered-neurons/get_surprisals.py | 35 ++++----- models/ordered-neurons/spec.template.json | 2 +- models/tinylstm/Dockerfile | 1 + models/tinylstm/bin/get_surprisals | 2 +- models/tinylstm/get_surprisals.py | 86 +++++++++++++++++++++++ models/tinylstm/spec.template.json | 2 +- 7 files changed, 105 insertions(+), 25 deletions(-) create mode 100644 models/tinylstm/get_surprisals.py diff --git a/models/ordered-neurons/bin/get_surprisals b/models/ordered-neurons/bin/get_surprisals index 0d1f5e0..cedeccf 100755 --- a/models/ordered-neurons/bin/get_surprisals +++ b/models/ordered-neurons/bin/get_surprisals @@ -8,4 +8,4 @@ INPUT_FILE="$1" python ${MODEL_ROOT}/get_surprisals.py \ "$MODEL_CHECKPOINT" /tmp/input_tokenized \ - --corpus_file "$MODEL_CORPUS" --no-cuda + --corpus_file "$MODEL_CORPUS" diff --git a/models/ordered-neurons/get_surprisals.py b/models/ordered-neurons/get_surprisals.py index e91d298..8c63cba 100644 --- a/models/ordered-neurons/get_surprisals.py +++ b/models/ordered-neurons/get_surprisals.py @@ -1,4 +1,5 @@ import argparse +import os from pathlib import Path import pickle import sys @@ -20,30 +21,26 @@ parser.add_argument("--bptt", type=int, default=70, help="sequence length") parser.add_argument("--emsize", type=int, default=400, help="size of word embeddings") parser.add_argument("--seed", type=int, default=1111, help="random seed") -parser.add_argument("--cuda", dest="cuda", action="store_true") -parser.add_argument("--no-cuda", dest="cuda", action="store_false") -parser.set_defaults(cuda=True) args = parser.parse_args() +use_cuda = torch.cuda.is_available() and os.environ.get("LMZOO_USE_GPU", False) +if use_cuda: + sys.stderr.write("Using GPU device.\n") +device = "cuda" if use_cuda 
else "cpu" + # Set the random seed manually for reproducibility. def set_seed(seed): np.random.seed(seed) torch.manual_seed(seed) - if torch.cuda.is_available(): - if not args.cuda: - print("WARNING: You have a CUDA device, so you should probably run with --cuda") - else: - torch.cuda.manual_seed(seed) + if use_cuda: + torch.cuda.manual_seed(seed) def model_load(fn): global model, criterion, optimizer with open(fn, 'rb') as f: - kwargs = {} - if not args.cuda: - kwargs["map_location"] = "cpu" - model, criterion, optimizer = torch.load(f, **kwargs) + model, criterion, optimizer = torch.load(f, map_location=device) def get_batch(data_source, i, window): @@ -73,7 +70,7 @@ def get_surprisals(sentences, corpus, outf, seed): raise RuntimeError("Internal error: Dictionary lookup failed. This should not happen with properly unked inputs.") # model expects T * batch_size array - data_source = data_source.unsqueeze(1) + data_source = data_source.unsqueeze(1).to(device) with torch.no_grad(): hidden = model.init_hidden(1) @@ -84,19 +81,15 @@ def get_surprisals(sentences, corpus, outf, seed): torch.nn.functional.linear(output, model.decoder.weight, bias=model.decoder.bias), dim=1) - # Convert to numpy and change to log2. - logprobs = logprobs.detach().numpy() - logprobs /= np.log(2) - - # Retrieve relevant surprisal values. - targets = targets.numpy() - target_surprisals = -logprobs[np.arange(len(targets)), targets] + # Convert to surprisals and extract relevant surprisal value. 
+ surprisals = - logprobs / np.log(2) + target_surprisals = surprisals[np.arange(len(targets)), targets].cpu() for k, surp in enumerate(target_surprisals): outf.write("%i\t%i\t%s\t%f\n" % (i + 1, j + k + 2, sentence[j + k + 1], surp)) -corpus = torch.load(args.corpus_file) +corpus = torch.load(args.corpus_file, map_location=device) model_load(args.model_checkpoint) sentences = [line.strip().split(" ") for line in args.file.readlines() if line.strip()] get_surprisals(sentences, corpus, args.outf, args.seed) diff --git a/models/ordered-neurons/spec.template.json b/models/ordered-neurons/spec.template.json index aa16f57..ecf96e6 100644 --- a/models/ordered-neurons/spec.template.json +++ b/models/ordered-neurons/spec.template.json @@ -16,7 +16,7 @@ }, "gpu": { "required": false, - "supported": false + "supported": true } }, diff --git a/models/tinylstm/Dockerfile b/models/tinylstm/Dockerfile index 778e714..c5b3c72 100644 --- a/models/tinylstm/Dockerfile +++ b/models/tinylstm/Dockerfile @@ -31,6 +31,7 @@ ARG MODEL_ROOT=models/tinylstm COPY --from=builder /opt/tinylstm /opt/tinylstm RUN pip install -q nltk +COPY ${MODEL_ROOT}/get_surprisals.py /opt/tinylstm # Add test dependencies. 
RUN pip install nose jsonschema diff --git a/models/tinylstm/bin/get_surprisals b/models/tinylstm/bin/get_surprisals index 7cd4ff2..d7ffda1 100755 --- a/models/tinylstm/bin/get_surprisals +++ b/models/tinylstm/bin/get_surprisals @@ -6,7 +6,7 @@ INPUT_FILE="$1" /opt/bin/tokenize $1 > /tmp/input_tokenized -python ${MODEL_ROOT}/eval.py \ +python ${MODEL_ROOT}/get_surprisals.py \ --checkpoint "$MODEL_CHECKPOINT" \ --corpus "$MODEL_CORPUS" \ --eval_data /tmp/input_tokenized \ diff --git a/models/tinylstm/get_surprisals.py b/models/tinylstm/get_surprisals.py new file mode 100644 index 0000000..a958780 --- /dev/null +++ b/models/tinylstm/get_surprisals.py @@ -0,0 +1,86 @@ +import argparse +import os +import sys + +import numpy as np +import torch +import torch.nn.functional as F +from tqdm import tqdm + +sys.path.append("/opt/tinylstm") +import data + +parser = argparse.ArgumentParser(description='PyTorch PTB Language Model') + +# Model parameters. +parser.add_argument('--corpus', type=str, default=None, required=True, + help='location of corpus file') +parser.add_argument('--checkpoint', type=str, required=True, + help='model checkpoint to use') +parser.add_argument('--seed', type=int, default=1111, + help='random seed') +parser.add_argument('--eval_data', type=str, default='stimuli_items/input_test.raw') +parser.add_argument('--outf', type=argparse.FileType("w", encoding="utf-8"), default=sys.stdout, + help='output file for generated text') + +parser.set_defaults(refresh_state=True) +parser.add_argument("--no_refresh_state", dest="refresh_state", action="store_false", + help="Don't refresh the RNN hidden state between sentences.") + +args = parser.parse_args() + +use_cuda = torch.cuda.is_available() and bool(os.environ.get("LMZOO_USE_GPU", False)) +if use_cuda: + sys.stderr.write("Using GPU device.\n") + +# Set the random seed manually for reproducibility. 
+torch.manual_seed(args.seed) +if use_cuda: + torch.cuda.manual_seed(args.seed) + +device = torch.device("cuda" if use_cuda else "cpu") + +with open(args.checkpoint, 'rb') as f: + if use_cuda: + model = torch.load(f).to(device) + else: + model = torch.load(f, map_location=lambda storage, loc: storage) + model.cpu() +model.eval() + + +corpus = torch.load(args.corpus) + +ntokens = len(corpus.dictionary) +hidden = model.init_hidden(1) +input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device) + + +# read eval data +with open(args.eval_data, 'r') as f: + lines = f.readlines() +sents = [line.strip().split() + [""] for line in lines] + + +with args.outf as f: + # write header. + f.write("sentence_id\ttoken_id\ttoken\tsurprisal\n") + with torch.no_grad(): # no tracking history + # all_ppls = [] + for sent_id, sent in enumerate(tqdm(sents)): + if args.refresh_state: + hidden = model.init_hidden(1) + + input = torch.tensor([[corpus.dictionary.word2idx[sent[0]]]],dtype=torch.long).to(device) + + f.write("%i\t%i\t%s\t%f\n" % (sent_id + 1, 1, sent[0], 0.0)) + + for i, w in enumerate(sent[1:]): + output, hidden = model(input, hidden) + surprisals = - F.log_softmax(output, dim=2) / np.log(2) + word_idx = corpus.dictionary.word2idx[w] + word_surprisal = surprisals[0, 0, word_idx].item() + + f.write("%i\t%i\t%s\t%f\n" % (sent_id + 1, i + 2, w, word_surprisal)) + + input.fill_(word_idx) diff --git a/models/tinylstm/spec.template.json b/models/tinylstm/spec.template.json index 14c81b9..d2adce1 100644 --- a/models/tinylstm/spec.template.json +++ b/models/tinylstm/spec.template.json @@ -16,7 +16,7 @@ }, "gpu": { "required": false, - "supported": false + "supported": true } }, From 75a97862cbbade0574d9a03ddd60103b7fe07363 Mon Sep 17 00:00:00 2001 From: Jon Gauthier Date: Mon, 25 May 2020 16:02:16 -0400 Subject: [PATCH 2/4] update registry --- docs/registry.json | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/registry.json 
b/docs/registry.json index 5bdde1d..cc25ae8 100644 --- a/docs/registry.json +++ b/docs/registry.json @@ -121,9 +121,9 @@ "ref_url": "https://github.com/yikangshen/Ordered-Neurons", "image": { "maintainer": "jon@gauthiers.net", - "version": "ca867981a9bd1ac987447ab6f77e0850236f604d", - "checksum": "f302550ef5bd6f45c6adbbd98eefc56db7f0dd73", - "datetime": "2020-05-25T17:43:02.573233114Z", + "version": "0d042cffeebbec17cc6dd8729ab805e39e0ebc7e", + "checksum": "7e1edbbe949c47933978572cc666648052e00a6c", + "datetime": "2020-05-25T20:00:54.469411391Z", "supported_features": { "tokenize": true, "unkify": true, @@ -133,12 +133,12 @@ }, "gpu": { "required": false, - "supported": false + "supported": true }, "registry": "docker.io", "name": "cpllab/language-models", "tag": "ordered-neurons", - "size": 3218739014 + "size": 3218738720 }, "tokenizer": { "type": "word", @@ -179,9 +179,9 @@ "ref_url": "https://github.com/cpllab/tinylstm", "image": { "maintainer": "jon@gauthiers.net", - "version": "3ee853277f636efd11b77c5572728a2aeec7f7cd", - "checksum": "ad49307002c24954c8fc9aa0e4c63f805862ae2b", - "datetime": "2020-05-25T18:15:23.979397297Z", + "version": "0d042cffeebbec17cc6dd8729ab805e39e0ebc7e", + "checksum": "f53db46ca34d796f1710a0fae193c0d538c32c46", + "datetime": "2020-05-25T20:01:15.019868616Z", "supported_features": { "tokenize": true, "unkify": true, @@ -191,12 +191,12 @@ }, "gpu": { "required": false, - "supported": false + "supported": true }, "registry": "docker.io", "name": "cpllab/language-models", "tag": "tinylstm", - "size": 3295281522 + "size": 3295284373 }, "tokenizer": { "type": "word", From 9d8db666f1fe9fa22cbd37ab6eff7c15a0fb6a8a Mon Sep 17 00:00:00 2001 From: Jon Gauthier Date: Tue, 26 May 2020 14:38:02 -0400 Subject: [PATCH 3/4] tinylstm: don't surreptitiously add token --- models/tinylstm/get_surprisals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/tinylstm/get_surprisals.py b/models/tinylstm/get_surprisals.py index 
a958780..2ed8692 100644 --- a/models/tinylstm/get_surprisals.py +++ b/models/tinylstm/get_surprisals.py @@ -59,7 +59,7 @@ # read eval data with open(args.eval_data, 'r') as f: lines = f.readlines() -sents = [line.strip().split() + [""] for line in lines] +sents = [line.strip().split() for line in lines] with args.outf as f: From f80b40198db3c9e6ea91379dfc559a7332a3e655 Mon Sep 17 00:00:00 2001 From: Jon Gauthier Date: Tue, 26 May 2020 14:40:27 -0400 Subject: [PATCH 4/4] [ci skip] another rebuild & update registry --- docs/registry.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/registry.json b/docs/registry.json index cc25ae8..42b42b4 100644 --- a/docs/registry.json +++ b/docs/registry.json @@ -121,9 +121,9 @@ "ref_url": "https://github.com/yikangshen/Ordered-Neurons", "image": { "maintainer": "jon@gauthiers.net", - "version": "0d042cffeebbec17cc6dd8729ab805e39e0ebc7e", - "checksum": "7e1edbbe949c47933978572cc666648052e00a6c", - "datetime": "2020-05-25T20:00:54.469411391Z", + "version": "9d8db666f1fe9fa22cbd37ab6eff7c15a0fb6a8a", + "checksum": "8e99c084ea20275e9a247badf99139a8a20eb6c7", + "datetime": "2020-05-26T18:39:36.015922392Z", "supported_features": { "tokenize": true, "unkify": true, @@ -179,9 +179,9 @@ "ref_url": "https://github.com/cpllab/tinylstm", "image": { "maintainer": "jon@gauthiers.net", - "version": "0d042cffeebbec17cc6dd8729ab805e39e0ebc7e", - "checksum": "f53db46ca34d796f1710a0fae193c0d538c32c46", - "datetime": "2020-05-25T20:01:15.019868616Z", + "version": "9d8db666f1fe9fa22cbd37ab6eff7c15a0fb6a8a", + "checksum": "344a20e0e67a8ee39c38e34ae1a015f2eb8e99d3", + "datetime": "2020-05-26T18:39:22.988852042Z", "supported_features": { "tokenize": true, "unkify": true, @@ -196,7 +196,7 @@ "registry": "docker.io", "name": "cpllab/language-models", "tag": "tinylstm", - "size": 3295284373 + "size": 3295284357 }, "tokenizer": { "type": "word",