
Commit

Fix some comments and docstrings.
Karan Desai committed Feb 11, 2019
1 parent 82e3c6b commit feba6de
Showing 6 changed files with 53 additions and 50 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -36,7 +36,7 @@ If you are a returning user (from Visual Dialog Challenge 2018), here are some k
Setup and Dependencies
----------------------

This starter code is implemented using PyTorch v1.0, and provides out of the box support with cuda 9 and CuDNN 7.
This starter code is implemented using PyTorch v1.0, and provides out of the box support with CUDA 9 and CuDNN 7.
There are two recommended ways to set up this codebase: Anaconda or Miniconda, and Docker.

### Anaconda or Miniconda
13 changes: 6 additions & 7 deletions evaluate.py
@@ -1,5 +1,4 @@
import argparse
from datetime import datetime
import json
import os

@@ -89,7 +88,7 @@
if isinstance(args.gpu_ids, int): args.gpu_ids = [args.gpu_ids]
device = torch.device("cuda", args.gpu_ids[0]) if args.gpu_ids[0] >= 0 else torch.device("cpu")

# print config and args
# Print config and args.
print(yaml.dump(config, default_flow_style=False))
for arg in vars(args):
print("{:<20}: {}".format(arg, getattr(args, arg)))
@@ -112,16 +111,16 @@
val_dataset, batch_size=config["solver"]["batch_size"], num_workers=args.cpu_workers
)

# pass vocabulary to construct nn.Embedding
# Pass vocabulary to construct Embedding layer.
encoder = Encoder(config["model"], val_dataset.vocabulary)
decoder = Decoder(config["model"], val_dataset.vocabulary)
print("Encoder: {}".format(config["model"]["encoder"]))
print("Decoder: {}".format(config["model"]["decoder"]))

# share word embedding between encoder and decoder
# Share word embedding between encoder and decoder.
decoder.word_embed = encoder.word_embed

# wrap encoder and decoder in a model
# Wrap encoder and decoder in a model.
model = EncoderDecoderModel(encoder, decoder).to(device)
if -1 not in args.gpu_ids:
model = nn.DataParallel(model, args.gpu_ids)
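The Encoder and Decoder internals are not part of this diff; the following is only a hedged sketch of what "Pass vocabulary to construct Embedding layer" refers to. The config key and `Vocabulary` attributes used here are assumptions for illustration, not taken from this commit.

```python
import torch.nn as nn

class SketchEncoder(nn.Module):
    """Illustrative only: build a word embedding table from the vocabulary."""
    def __init__(self, config, vocabulary):
        super().__init__()
        # One row per token in the vocabulary; the PAD row is never updated.
        self.word_embed = nn.Embedding(
            len(vocabulary),                   # assumes Vocabulary defines __len__
            config["word_embedding_size"],     # assumed config key
            padding_idx=vocabulary.PAD_INDEX,
        )

# Assigning `decoder.word_embed = encoder.word_embed` (as in the diff above) makes
# both modules reference the same nn.Embedding instance, so the word embedding
# weights are stored and optimized exactly once.
```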
@@ -152,8 +151,8 @@

ranks = scores_to_ranks(output)
for i in range(len(batch["img_ids"])):
# cast into types explicitly to ensure no errors in schema
# round ids are 1-10, not 0-9
# Cast into types explicitly to ensure no errors in schema.
# Round ids are 1-10, not 0-9
if args.split == "test":
ranks_json.append({
"image_id": batch["img_ids"][i].item(),
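The evaluation loop above is cut off while the `ranks_json` entries are being assembled. The helper below is only a sketch of the ranking step it relies on; the real `scores_to_ranks` lives elsewhere in `visdialch` and may differ in detail.

```python
import torch

def scores_to_ranks_sketch(scores: torch.Tensor) -> torch.Tensor:
    """Convert option scores of shape (batch, rounds, options) to 1-based ranks."""
    # The first argsort orders options by descending score; the second inverts
    # that permutation, so each option index maps to its position in the order.
    return scores.argsort(dim=-1, descending=True).argsort(dim=-1) + 1

# The explicit casts in the loop above matter because JSON cannot serialize
# torch tensors: image ids, round ids (1-10, not 0-9) and ranks must all be
# plain Python ints to satisfy the challenge submission schema.
```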
19 changes: 9 additions & 10 deletions train.py
@@ -1,6 +1,5 @@
import argparse
import itertools
import os

from tensorboardX import SummaryWriter
import torch
@@ -89,7 +88,7 @@
if isinstance(args.gpu_ids, int): args.gpu_ids = [args.gpu_ids]
device = torch.device("cuda", args.gpu_ids[0]) if args.gpu_ids[0] >= 0 else torch.device("cpu")

# print config and args
# Print config and args.
print(yaml.dump(config, default_flow_style=False))
for arg in vars(args):
print("{:<20}: {}".format(arg, getattr(args, arg)))
@@ -113,16 +112,16 @@
val_dataset, batch_size=config["solver"]["batch_size"], num_workers=args.cpu_workers
)

# pass vocabulary to construct nn.Embedding
# Pass vocabulary to construct Embedding layer.
encoder = Encoder(config["model"], train_dataset.vocabulary)
decoder = Decoder(config["model"], train_dataset.vocabulary)
print("Encoder: {}".format(config["model"]["encoder"]))
print("Decoder: {}".format(config["model"]["decoder"]))

# share word embedding between encoder and decoder
# Share word embedding between encoder and decoder.
decoder.word_embed = encoder.word_embed

# wrap encoder and decoder in a model
# Wrap encoder and decoder in a model.
model = EncoderDecoderModel(encoder, decoder).to(device)
if -1 not in args.gpu_ids:
model = nn.DataParallel(model, args.gpu_ids)
@@ -141,7 +140,7 @@
sparse_metrics = SparseGTMetrics()
ndcg = NDCG()

# if loading from checkpoint, adjust start epoch and load parameters
# If loading from checkpoint, adjust start epoch and load parameters.
if args.load_pthpath == "":
start_epoch = 0
else:
@@ -169,14 +168,14 @@
global_iteration_step = start_epoch * iterations
for epoch in range(start_epoch, config["solver"]["num_epochs"] + 1):

# --------------------------------------------------------------------------------------------
# ON EPOCH START (combine dataloaders if training on train + val)
# --------------------------------------------------------------------------------------------
if config["solver"]["training_splits"] == "trainval":
combined_dataloader = itertools.chain(train_dataloader, val_dataloader)
else:
combined_dataloader = itertools.chain(train_dataloader)

# --------------------------------------------------------------------------------------------
# ON EPOCH START (combine dataloaders if training on train + val)
# --------------------------------------------------------------------------------------------
print(f"\nTraining for epoch {epoch}:")
for i, batch in enumerate(tqdm(combined_dataloader)):
for key in batch:
@@ -199,7 +198,7 @@
# --------------------------------------------------------------------------------------------
checkpoint_manager.step()

# validate and report automatic metrics
# Validate and report automatic metrics.
if args.validate:
print(f"\nValidation after epoch {epoch}:")
for i, batch in enumerate(tqdm(val_dataloader)):
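The validation block ("Validate and report automatic metrics.") is truncated above. Purely as an illustrative sketch, assuming `observe`/`retrieve`-style methods on `SparseGTMetrics` and `NDCG` (the method names are not confirmed by this diff), a post-epoch validation pass might look like:

```python
# Hypothetical post-epoch validation; metric method names are assumptions.
model.eval()
with torch.no_grad():
    for batch in tqdm(val_dataloader):
        batch = {key: value.to(device) for key, value in batch.items()}
        output = model(batch)
        sparse_metrics.observe(output, batch["ans_ind"])
        if "gt_relevance" in batch:
            # The real script likely scores only the densely annotated round here.
            ndcg.observe(output, batch["gt_relevance"])

all_metrics = {}
all_metrics.update(sparse_metrics.retrieve(reset=True))
all_metrics.update(ndcg.retrieve(reset=True))
for metric_name, metric_value in all_metrics.items():
    print(f"{metric_name}: {metric_value}")
model.train()
```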
41 changes: 23 additions & 18 deletions visdialch/data/dataset.py
@@ -1,4 +1,4 @@
from typing import Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union

import torch
from torch.nn.functional import normalize
@@ -10,8 +10,13 @@


class VisDialDataset(Dataset):
"""
A full representation of the VisDial v1.0 (train/val/test) dataset. According to the appropriate
split, it returns a dictionary of question, image, history, ground truth answer, answer options,
dense annotations, etc.
"""
def __init__(self,
config: Dict[str, Union[int, str]],
config: Dict[str, Any],
dialogs_jsonpath: str,
dense_annotations_jsonpath: Optional[str] = None,
overfit: bool = False,
@@ -29,7 +34,7 @@ def __init__(self,
config["word_counts_json"], min_count=config["vocab_min_count"]
)

# initialize image features reader according to split
# Initialize image features reader according to split.
image_features_hdfpath = config["image_features_train_h5"]
if "val" in self.dialogs_reader.split:
image_features_hdfpath = config["image_features_val_h5"]
@@ -38,7 +43,7 @@

self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath, in_memory)

# keep a list of image_ids as primary keys to access data
# Keep a list of image_ids as primary keys to access data.
self.image_ids = list(self.dialogs_reader.dialogs.keys())
if overfit:
self.image_ids = self.image_ids[:5]
@@ -51,22 +56,22 @@ def __len__(self):
return len(self.image_ids)

def __getitem__(self, index):
# get image_id, which serves as a primary key for current instance
# Get image_id, which serves as a primary key for current instance.
image_id = self.image_ids[index]

# get image features for this image_id using hdf reader
# Get image features for this image_id using hdf reader.
image_features = self.hdf_reader[image_id]
image_features = torch.tensor(image_features)
# normalize image features at zero-th dimension (since there's no batch dimension)
# Normalize image features at zero-th dimension (since there's no batch dimension).
if self.config["img_norm"]:
image_features = normalize(image_features, dim=0, p=2)

# retrieve instance for this image_id using json reader
# Retrieve instance for this image_id using json reader.
visdial_instance = self.dialogs_reader[image_id]
caption = visdial_instance["caption"]
dialog = visdial_instance["dialog"]

# convert word tokens of caption, question, answer and answer options to integers
# Convert word tokens of caption, question, answer and answer options to integers.
caption = self.vocabulary.to_indices(caption)
for i in range(len(dialog)):
dialog[i]["question"] = self.vocabulary.to_indices(dialog[i]["question"])
@@ -97,8 +102,8 @@ def __getitem__(self, index):
if "test" not in self.split:
answer_indices = [dialog_round["gt_index"] for dialog_round in dialog]

# collect everything as tensors for ``collate_fn`` of dataloader to work seamlessly
# questions, history, etc. are converted to LongTensors, for nn.Embedding input
# Collect everything as tensors for ``collate_fn`` of dataloader to work seamlessly
# questions, history, etc. are converted to LongTensors, for nn.Embedding input.
item = {}
item["img_ids"] = torch.tensor(image_id).long()
item["img_feat"] = image_features
@@ -112,7 +117,7 @@
if "test" not in self.split:
item["ans_ind"] = torch.tensor(answer_indices).long()

# gather dense annotations
# Gather dense annotations.
if "val" in self.split:
dense_annotations = self.annotations_reader[image_id]
item["gt_relevance"] = torch.tensor(dense_annotations["gt_relevance"]).float()
@@ -142,7 +147,7 @@ def _pad_sequences(self, sequences: List[List[int]]):
sequences[i] = sequences[i][: self.config["max_sequence_length"] - 1]
sequence_lengths = [len(sequence) for sequence in sequences]

# pad all sequences to max_sequence_length
# Pad all sequences to max_sequence_length.
maxpadded_sequences = torch.full(
(len(sequences), self.config["max_sequence_length"]),
fill_value=self.vocabulary.PAD_INDEX,
@@ -158,7 +163,7 @@ def _get_history(self,
caption: List[int],
questions: List[List[int]],
answers: List[List[int]]):
# allow double length of caption, equivalent to a concatenated QA pair
# Allow double length of caption, equivalent to a concatenated QA pair.
caption = caption[: self.config["max_sequence_length"] * 2 - 1]

for i in range(len(questions)):
@@ -167,18 +172,18 @@ def _get_history(self,
for i in range(len(answers)):
answers[i] = answers[i][: self.config["max_sequence_length"] - 1]

# history for first round is caption, else concatenated QA pair of previous round
# History for first round is caption, else concatenated QA pair of previous round.
history = []
history.append(caption)
for question, answer in zip(questions, answers):
history.append(question + answer + [self.vocabulary.EOS_INDEX])
# drop last entry from history (there's no eleventh question)
# Drop last entry from history (there's no eleventh question).
history = history[:-1]
max_history_length = self.config["max_sequence_length"] * 2

if self.config.get("concat_history", False):
# concatenated_history has similar structure as history, except it contains
# concatenated QA pairs from previous rounds
# Concatenated_history has similar structure as history, except it contains
# concatenated QA pairs from previous rounds.
concatenated_history = []
concatenated_history.append(caption)
for i in range(1, len(history)):
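The `_pad_sequences` hunk above is truncated in the middle of the `torch.full` call. Below is a minimal, self-contained sketch of the padding pattern it sets up; the return values and dtype are assumptions, since the rest of the method is not shown in this diff.

```python
import torch
from typing import List, Tuple

def pad_sequences_sketch(sequences: List[List[int]],
                         max_sequence_length: int,
                         pad_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Right-pad variable-length token lists into a (num_sequences, max_len) LongTensor."""
    # Truncate first (leaving room for a final token, as in the snippet above),
    # then copy each sequence into a tensor pre-filled with the PAD index.
    sequences = [seq[: max_sequence_length - 1] for seq in sequences]
    sequence_lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.long)
    maxpadded = torch.full(
        (len(sequences), max_sequence_length), fill_value=pad_index, dtype=torch.long
    )
    for i, seq in enumerate(sequences):
        maxpadded[i, : len(seq)] = torch.tensor(seq, dtype=torch.long)
    return maxpadded, sequence_lengths
```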
22 changes: 11 additions & 11 deletions visdialch/data/readers.py
@@ -15,7 +15,7 @@
from typing import Dict, List, Union

import h5py
# a bit slow, and just splits sentences to list of words, can be doable in VisDialJsonReader
# A bit slow, and just splits sentences to list of words, can be doable in `DialogsReader`.
from nltk.tokenize import word_tokenize
from tqdm import tqdm

@@ -39,27 +39,27 @@ def __init__(self, dialogs_jsonpath: str):
self.questions = visdial_data["data"]["questions"]
self.answers = visdial_data["data"]["answers"]

# add empty question, answer at the end, useful for padding dialog rounds for test
# Add empty question, answer at the end, useful for padding dialog rounds for test.
self.questions.append("")
self.answers.append("")

# image_id serves as key for all three dicts here
# Image_id serves as key for all three dicts here.
self.captions = {}
self.dialogs = {}
self.num_rounds = {}

for dialog_for_image in visdial_data["data"]["dialogs"]:
self.captions[dialog_for_image["image_id"]] = dialog_for_image["caption"]

# record original length of dialog, before padding
# 10 for train and val splits, 10 or less for test split
# Record original length of dialog, before padding.
# 10 for train and val splits, 10 or less for test split.
self.num_rounds[dialog_for_image["image_id"]] = len(dialog_for_image["dialog"])

# pad dialog at the end with empty question and answer pairs (for test split)
# Pad dialog at the end with empty question and answer pairs (for test split).
while len(dialog_for_image["dialog"]) < 10:
dialog_for_image["dialog"].append({"question": -1, "answer": -1})

# add empty answer / answer options if not provided (for test split)
# Add empty answer / answer options if not provided (for test split).
for i in range(len(dialog_for_image["dialog"])):
if "answer" not in dialog_for_image["dialog"][i]:
dialog_for_image["dialog"][i]["answer"] = -1
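A small aside on why the `-1` placeholders above work (toy data, not from the dataset): the empty question and answer appended in `__init__` sit at the end of their respective lists, and Python's `-1` index resolves to exactly that element.

```python
questions = ["is it sunny", "how many people are there"]  # toy examples
questions.append("")           # as done above for padding
assert questions[-1] == ""     # a padded round with question == -1 maps to the empty string
```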
@@ -88,7 +88,7 @@ def __getitem__(self, image_id: int) -> Dict[str, Union[int, str, List]]:
dialog_for_image = copy.copy(self.dialogs[image_id])
num_rounds = self.num_rounds[image_id]

# replace question and answer indices with actual word tokens
# Replace question and answer indices with actual word tokens.
for i in range(len(dialog_for_image)):
dialog_for_image[i]["question"] = self.questions[dialog_for_image[i]["question"]]
dialog_for_image[i]["answer"] = self.answers[dialog_for_image[i]["answer"]]
@@ -171,7 +171,7 @@ def __init__(self, features_hdfpath: str, in_memory: bool = False):
self._split = features_hdf.attrs["split"]
self.image_id_list = list(features_hdf["image_id"])
# "features" is List[np.ndarray] if the dataset is loaded in-memory
# if not loaded in memory, then list of None
# If not loaded in memory, then list of None.
self.features = [None] * len(self.image_id_list)


@@ -181,15 +181,15 @@ def __len__(self):
def __getitem__(self, image_id: int):
index = self.image_id_list.index(image_id)
if self._in_memory:
# load features during first epoch, all not loaded together as it has a slow start
# Load features during first epoch, all not loaded together as it has a slow start.
if self.features[index] is not None:
image_id_features = self.features[index]
else:
with h5py.File(self.features_hdfpath, "r") as features_hdf:
image_id_features = features_hdf["features"][index]
self.features[index] = image_id_features
else:
# read chunk from file every time if not loaded in memory
# Read chunk from file every time if not loaded in memory.
with h5py.File(self.features_hdfpath, "r") as features_hdf:
image_id_features = features_hdf["features"][index]

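The reader above implies a particular HDF5 layout: an `image_id` dataset aligned index-for-index with a `features` dataset, plus a `split` attribute. A small hedged probe under that assumption:

```python
import h5py

def describe_features_h5(features_hdfpath: str) -> None:
    """Print the basic layout assumed by ImageFeaturesHdfReader (illustrative only)."""
    with h5py.File(features_hdfpath, "r") as features_hdf:
        print("split:", features_hdf.attrs["split"])
        print("num images:", len(features_hdf["image_id"]))
        print("per-image feature shape:", features_hdf["features"].shape[1:])
```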
6 changes: 3 additions & 3 deletions visdialch/decoders/disc.py
@@ -16,7 +16,7 @@ def __init__(self, config, vocabulary):
config["lstm_hidden_size"],
batch_first=True)

# options are variable length padded sequences, use DynamicRNN
# Options are variable length padded sequences, use DynamicRNN.
self.option_rnn = DynamicRNN(self.option_rnn)

def forward(self, encoder_output, batch):
@@ -58,11 +58,11 @@ def forward(self, encoder_output, batch):
)
options_embed[nonzero_options_length_indices] = nonzero_options_embed

# Repeat encoder output for every option
# Repeat encoder output for every option.
# shape: (batch_size, num_rounds, num_options, max_sequence_length)
encoder_output = encoder_output.unsqueeze(2).repeat(1, 1, num_options, 1)

# shape now same as `options`, can calculate dot product similarity
# Shape now same as `options`, can calculate dot product similarity.
# shape: (batch_size * num_rounds * num_options, lstm_hidden_state)
encoder_output = encoder_output.view(
batch_size * num_rounds * num_options, self.config["lstm_hidden_size"]
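The forward pass above is cut off just before the scores are computed. As a hedged sketch of the dot-product similarity the comments describe (the final reshape and return are assumptions, not shown in this diff):

```python
import torch

def dot_product_scores_sketch(encoder_output: torch.Tensor,
                              options_embed: torch.Tensor,
                              batch_size: int,
                              num_rounds: int,
                              num_options: int) -> torch.Tensor:
    """Score every answer option against the encoder state.

    Both inputs are expected flattened to
    (batch_size * num_rounds * num_options, lstm_hidden_size), as in the snippet above.
    """
    # Element-wise product summed over the hidden dimension is a per-row dot product.
    scores = torch.sum(options_embed * encoder_output, dim=-1)
    return scores.view(batch_size, num_rounds, num_options)
```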
