
Commit

Fix some comments and docstrings.
Karan Desai committed Feb 11, 2019
1 parent 82e3c6b commit feba6de
Showing 6 changed files with 53 additions and 50 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -36,7 +36,7 @@ If you are a returning user (from Visual Dialog Challenge 2018), here are some k
Setup and Dependencies
----------------------

This starter code is implemented using PyTorch v1.0, and provides out of the box support with cuda 9 and CuDNN 7.
This starter code is implemented using PyTorch v1.0, and provides out of the box support with CUDA 9 and CuDNN 7.
There are two recommended ways to set up this codebase: Anaconda or Miniconda, and Docker.

### Anaconda or Miniconda
13 changes: 6 additions & 7 deletions evaluate.py
@@ -1,5 +1,4 @@
import argparse
from datetime import datetime
import json
import os

@@ -89,7 +88,7 @@
if isinstance(args.gpu_ids, int): args.gpu_ids = [args.gpu_ids]
device = torch.device("cuda", args.gpu_ids[0]) if args.gpu_ids[0] >= 0 else torch.device("cpu")

# print config and args
# Print config and args.
print(yaml.dump(config, default_flow_style=False))
for arg in vars(args):
print("{:<20}: {}".format(arg, getattr(args, arg)))
@@ -112,16 +111,16 @@
val_dataset, batch_size=config["solver"]["batch_size"], num_workers=args.cpu_workers
)

# pass vocabulary to construct nn.Embedding
# Pass vocabulary to construct Embedding layer.
encoder = Encoder(config["model"], val_dataset.vocabulary)
decoder = Decoder(config["model"], val_dataset.vocabulary)
print("Encoder: {}".format(config["model"]["encoder"]))
print("Decoder: {}".format(config["model"]["decoder"]))

# share word embedding between encoder and decoder
# Share word embedding between encoder and decoder.
decoder.word_embed = encoder.word_embed

# wrap encoder and decoder in a model
# Wrap encoder and decoder in a model.
model = EncoderDecoderModel(encoder, decoder).to(device)
if -1 not in args.gpu_ids:
model = nn.DataParallel(model, args.gpu_ids)
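The Encoder and Decoder internals are not part of this diff; the following is only a hedged sketch of what "Pass vocabulary to construct Embedding layer" refers to. The config key and `Vocabulary` attributes used here are assumptions for illustration, not taken from this commit.

```python
import torch.nn as nn

class SketchEncoder(nn.Module):
    """Illustrative only: build a word embedding table from the vocabulary."""
    def __init__(self, config, vocabulary):
        super().__init__()
        # One row per token in the vocabulary; the PAD row is never updated.
        self.word_embed = nn.Embedding(
            len(vocabulary),                   # assumes Vocabulary defines __len__
            config["word_embedding_size"],     # assumed config key
            padding_idx=vocabulary.PAD_INDEX,
        )

# Assigning `decoder.word_embed = encoder.word_embed` (as in the diff above) makes
# both modules reference the same nn.Embedding instance, so the word embedding
# weights are stored and optimized exactly once.
```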
@@ -152,8 +151,8 @@

ranks = scores_to_ranks(output)
for i in range(len(batch["img_ids"])):
# cast into types explicitly to ensure no errors in schema
# round ids are 1-10, not 0-9
# Cast into types explicitly to ensure no errors in schema.
# Round ids are 1-10, not 0-9
if args.split == "test":
ranks_json.append({
"image_id": batch["img_ids"][i].item(),
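The evaluation loop above is cut off while the `ranks_json` entries are being assembled. The helper below is only a sketch of the ranking step it relies on; the real `scores_to_ranks` lives elsewhere in `visdialch` and may differ in detail.

```python
import torch

def scores_to_ranks_sketch(scores: torch.Tensor) -> torch.Tensor:
    """Convert option scores of shape (batch, rounds, options) to 1-based ranks."""
    # The first argsort orders options by descending score; the second inverts
    # that permutation, so each option index maps to its position in the order.
    return scores.argsort(dim=-1, descending=True).argsort(dim=-1) + 1

# The explicit casts in the loop above matter because JSON cannot serialize
# torch tensors: image ids, round ids (1-10, not 0-9) and ranks must all be
# plain Python ints to satisfy the challenge submission schema.
```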
19 changes: 9 additions & 10 deletions train.py
@@ -1,6 +1,5 @@
import argparse
import itertools
import os

from tensorboardX import SummaryWriter
import torch
@@ -89,7 +88,7 @@
if isinstance(args.gpu_ids, int): args.gpu_ids = [args.gpu_ids]
device = torch.device("cuda", args.gpu_ids[0]) if args.gpu_ids[0] >= 0 else torch.device("cpu")

# print config and args
# Print config and args.
print(yaml.dump(config, default_flow_style=False))
for arg in vars(args):
print("{:<20}: {}".format(arg, getattr(args, arg)))
@@ -113,16 +112,16 @@
val_dataset, batch_size=config["solver"]["batch_size"], num_workers=args.cpu_workers
)

# pass vocabulary to construct nn.Embedding
# Pass vocabulary to construct Embedding layer.
encoder = Encoder(config["model"], train_dataset.vocabulary)
decoder = Decoder(config["model"], train_dataset.vocabulary)
print("Encoder: {}".format(config["model"]["encoder"]))
print("Decoder: {}".format(config["model"]["decoder"]))

# share word embedding between encoder and decoder
# Share word embedding between encoder and decoder.
decoder.word_embed = encoder.word_embed

# wrap encoder and decoder in a model
# Wrap encoder and decoder in a model.
model = EncoderDecoderModel(encoder, decoder).to(device)
if -1 not in args.gpu_ids:
model = nn.DataParallel(model, args.gpu_ids)
@@ -141,7 +140,7 @@
sparse_metrics = SparseGTMetrics()
ndcg = NDCG()

# if loading from checkpoint, adjust start epoch and load parameters
# If loading from checkpoint, adjust start epoch and load parameters.
if args.load_pthpath == "":
start_epoch = 0
else:
@@ -169,14 +168,14 @@
global_iteration_step = start_epoch * iterations
for epoch in range(start_epoch, config["solver"]["num_epochs"] + 1):

# --------------------------------------------------------------------------------------------
# ON EPOCH START (combine dataloaders if training on train + val)
# --------------------------------------------------------------------------------------------
if config["solver"]["training_splits"] == "trainval":
combined_dataloader = itertools.chain(train_dataloader, val_dataloader)
else:
combined_dataloader = itertools.chain(train_dataloader)

# --------------------------------------------------------------------------------------------
# ON EPOCH START (combine dataloaders if training on train + val)
# --------------------------------------------------------------------------------------------
print(f"\nTraining for epoch {epoch}:")
for i, batch in enumerate(tqdm(combined_dataloader)):
for key in batch:
@@ -199,7 +198,7 @@
# --------------------------------------------------------------------------------------------
checkpoint_manager.step()

# validate and report automatic metrics
# Validate and report automatic metrics.
if args.validate:
print(f"\nValidation after epoch {epoch}:")
for i, batch in enumerate(tqdm(val_dataloader)):
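The validation block ("Validate and report automatic metrics.") is truncated above. Purely as an illustrative sketch, assuming `observe`/`retrieve`-style methods on `SparseGTMetrics` and `NDCG` (the method names are not confirmed by this diff), a post-epoch validation pass might look like:

```python
# Hypothetical post-epoch validation; metric method names are assumptions.
model.eval()
with torch.no_grad():
    for batch in tqdm(val_dataloader):
        batch = {key: value.to(device) for key, value in batch.items()}
        output = model(batch)
        sparse_metrics.observe(output, batch["ans_ind"])
        if "gt_relevance" in batch:
            # The real script likely scores only the densely annotated round here.
            ndcg.observe(output, batch["gt_relevance"])

all_metrics = {}
all_metrics.update(sparse_metrics.retrieve(reset=True))
all_metrics.update(ndcg.retrieve(reset=True))
for metric_name, metric_value in all_metrics.items():
    print(f"{metric_name}: {metric_value}")
model.train()
```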
41 changes: 23 additions & 18 deletions visdialch/data/dataset.py
@@ -1,4 +1,4 @@
from typing import Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union

import torch
from torch.nn.functional import normalize
@@ -10,8 +10,13 @@


class VisDialDataset(Dataset):
"""
A full representation of the VisDial v1.0 (train/val/test) dataset. According to the appropriate
split, it returns a dictionary of question, image, history, ground truth answer, answer options,
dense annotations, etc.
"""
def __init__(self,
config: Dict[str, Union[int, str]],
config: Dict[str, Any],
dialogs_jsonpath: str,
dense_annotations_jsonpath: Optional[str] = None,
overfit: bool = False,
@@ -29,7 +34,7 @@ def __init__(self,
config["word_counts_json"], min_count=config["vocab_min_count"]
)

# initialize image features reader according to split
# Initialize image features reader according to split.
image_features_hdfpath = config["image_features_train_h5"]
if "val" in self.dialogs_reader.split:
image_features_hdfpath = config["image_features_val_h5"]
@@ -38,7 +43,7 @@

self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath, in_memory)

# keep a list of image_ids as primary keys to access data
# Keep a list of image_ids as primary keys to access data.
self.image_ids = list(self.dialogs_reader.dialogs.keys())
if overfit:
self.image_ids = self.image_ids[:5]
@@ -51,22 +56,22 @@ def __len__(self):
return len(self.image_ids)

def __getitem__(self, index):
# get image_id, which serves as a primary key for current instance
# Get image_id, which serves as a primary key for current instance.
image_id = self.image_ids[index]

# get image features for this image_id using hdf reader
# Get image features for this image_id using hdf reader.
image_features = self.hdf_reader[image_id]
image_features = torch.tensor(image_features)
# normalize image features at zero-th dimension (since there's no batch dimension)
# Normalize image features at zero-th dimension (since there's no batch dimension).
if self.config["img_norm"]:
image_features = normalize(image_features, dim=0, p=2)

# retrieve instance for this image_id using json reader
# Retrieve instance for this image_id using json reader.
visdial_instance = self.dialogs_reader[image_id]
caption = visdial_instance["caption"]
dialog = visdial_instance["dialog"]

# convert word tokens of caption, question, answer and answer options to integers
# Convert word tokens of caption, question, answer and answer options to integers.
caption = self.vocabulary.to_indices(caption)
for i in range(len(dialog)):
dialog[i]["question"] = self.vocabulary.to_indices(dialog[i]["question"])
@@ -97,8 +102,8 @@ def __getitem__(self, index):
if "test" not in self.split:
answer_indices = [dialog_round["gt_index"] for dialog_round in dialog]

# collect everything as tensors for ``collate_fn`` of dataloader to work seamlessly
# questions, history, etc. are converted to LongTensors, for nn.Embedding input
# Collect everything as tensors for ``collate_fn`` of dataloader to work seamlessly
# questions, history, etc. are converted to LongTensors, for nn.Embedding input.
item = {}
item["img_ids"] = torch.tensor(image_id).long()
item["img_feat"] = image_features
@@ -112,7 +117,7 @@
if "test" not in self.split:
item["ans_ind"] = torch.tensor(answer_indices).long()

# gather dense annotations
# Gather dense annotations.
if "val" in self.split:
dense_annotations = self.annotations_reader[image_id]
item["gt_relevance"] = torch.tensor(dense_annotations["gt_relevance"]).float()
@@ -142,7 +147,7 @@ def _pad_sequences(self, sequences: List[List[int]]):
sequences[i] = sequences[i][: self.config["max_sequence_length"] - 1]
sequence_lengths = [len(sequence) for sequence in sequences]

# pad all sequences to max_sequence_length
# Pad all sequences to max_sequence_length.
maxpadded_sequences = torch.full(
(len(sequences), self.config["max_sequence_length"]),
fill_value=self.vocabulary.PAD_INDEX,
@@ -158,7 +163,7 @@ def _get_history(self,
caption: List[int],
questions: List[List[int]],
answers: List[List[int]]):
# allow double length of caption, equivalent to a concatenated QA pair
# Allow double length of caption, equivalent to a concatenated QA pair.
caption = caption[: self.config["max_sequence_length"] * 2 - 1]

for i in range(len(questions)):
@@ -167,18 +172,18 @@ def _get_history(self,
for i in range(len(answers)):
answers[i] = answers[i][: self.config["max_sequence_length"] - 1]

# history for first round is caption, else concatenated QA pair of previous round
# History for first round is caption, else concatenated QA pair of previous round.
history = []
history.append(caption)
for question, answer in zip(questions, answers):
history.append(question + answer + [self.vocabulary.EOS_INDEX])
# drop last entry from history (there's no eleventh question)
# Drop last entry from history (there's no eleventh question).
history = history[:-1]
max_history_length = self.config["max_sequence_length"] * 2

if self.config.get("concat_history", False):
# concatenated_history has similar structure as history, except it contains
# concatenated QA pairs from previous rounds
# Concatenated_history has similar structure as history, except it contains
# concatenated QA pairs from previous rounds.
concatenated_history = []
concatenated_history.append(caption)
for i in range(1, len(history)):
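The `_pad_sequences` hunk above is truncated in the middle of the `torch.full` call. Below is a minimal, self-contained sketch of the padding pattern it sets up; the return values and dtype are assumptions, since the rest of the method is not shown in this diff.

```python
import torch
from typing import List, Tuple

def pad_sequences_sketch(sequences: List[List[int]],
                         max_sequence_length: int,
                         pad_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Right-pad variable-length token lists into a (num_sequences, max_len) LongTensor."""
    # Truncate first (leaving room for a final token, as in the snippet above),
    # then copy each sequence into a tensor pre-filled with the PAD index.
    sequences = [seq[: max_sequence_length - 1] for seq in sequences]
    sequence_lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.long)
    maxpadded = torch.full(
        (len(sequences), max_sequence_length), fill_value=pad_index, dtype=torch.long
    )
    for i, seq in enumerate(sequences):
        maxpadded[i, : len(seq)] = torch.tensor(seq, dtype=torch.long)
    return maxpadded, sequence_lengths
```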
22 changes: 11 additions & 11 deletions visdialch/data/readers.py
@@ -15,7 +15,7 @@
from typing import Dict, List, Union

import h5py
# a bit slow, and just splits sentences to list of words, can be doable in VisDialJsonReader
# A bit slow, and just splits sentences to list of words, can be doable in `DialogsReader`.
from nltk.tokenize import word_tokenize
from tqdm import tqdm

@@ -39,27 +39,27 @@ def __init__(self, dialogs_jsonpath: str):
self.questions = visdial_data["data"]["questions"]
self.answers = visdial_data["data"]["answers"]

# add empty question, answer at the end, useful for padding dialog rounds for test
# Add empty question, answer at the end, useful for padding dialog rounds for test.
self.questions.append("")
self.answers.append("")

# image_id serves as key for all three dicts here
# Image_id serves as key for all three dicts here.
self.captions = {}
self.dialogs = {}
self.num_rounds = {}

for dialog_for_image in visdial_data["data"]["dialogs"]:
self.captions[dialog_for_image["image_id"]] = dialog_for_image["caption"]

# record original length of dialog, before padding
# 10 for train and val splits, 10 or less for test split
# Record original length of dialog, before padding.
# 10 for train and val splits, 10 or less for test split.
self.num_rounds[dialog_for_image["image_id"]] = len(dialog_for_image["dialog"])

# pad dialog at the end with empty question and answer pairs (for test split)
# Pad dialog at the end with empty question and answer pairs (for test split).
while len(dialog_for_image["dialog"]) < 10:
dialog_for_image["dialog"].append({"question": -1, "answer": -1})

# add empty answer / answer options if not provided (for test split)
# Add empty answer / answer options if not provided (for test split).
for i in range(len(dialog_for_image["dialog"])):
if "answer" not in dialog_for_image["dialog"][i]:
dialog_for_image["dialog"][i]["answer"] = -1
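A small aside on why the `-1` placeholders above work (toy data, not from the dataset): the empty question and answer appended in `__init__` sit at the end of their respective lists, and Python's `-1` index resolves to exactly that element.

```python
questions = ["is it sunny", "how many people are there"]  # toy examples
questions.append("")           # as done above for padding
assert questions[-1] == ""     # a padded round with question == -1 maps to the empty string
```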
@@ -88,7 +88,7 @@ def __getitem__(self, image_id: int) -> Dict[str, Union[int, str, List]]:
dialog_for_image = copy.copy(self.dialogs[image_id])
num_rounds = self.num_rounds[image_id]

# replace question and answer indices with actual word tokens
# Replace question and answer indices with actual word tokens.
for i in range(len(dialog_for_image)):
dialog_for_image[i]["question"] = self.questions[dialog_for_image[i]["question"]]
dialog_for_image[i]["answer"] = self.answers[dialog_for_image[i]["answer"]]
@@ -171,7 +171,7 @@ def __init__(self, features_hdfpath: str, in_memory: bool = False):
self._split = features_hdf.attrs["split"]
self.image_id_list = list(features_hdf["image_id"])
# "features" is List[np.ndarray] if the dataset is loaded in-memory
# if not loaded in memory, then list of None
# If not loaded in memory, then list of None.
self.features = [None] * len(self.image_id_list)


@@ -181,15 +181,15 @@ def __len__(self):
def __getitem__(self, image_id: int):
index = self.image_id_list.index(image_id)
if self._in_memory:
# load features during first epoch, all not loaded together as it has a slow start
# Load features during first epoch, all not loaded together as it has a slow start.
if self.features[index] is not None:
image_id_features = self.features[index]
else:
with h5py.File(self.features_hdfpath, "r") as features_hdf:
image_id_features = features_hdf["features"][index]
self.features[index] = image_id_features
else:
# read chunk from file every time if not loaded in memory
# Read chunk from file every time if not loaded in memory.
with h5py.File(self.features_hdfpath, "r") as features_hdf:
image_id_features = features_hdf["features"][index]

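The reader above implies a particular HDF5 layout: an `image_id` dataset aligned index-for-index with a `features` dataset, plus a `split` attribute. A small hedged probe under that assumption:

```python
import h5py

def describe_features_h5(features_hdfpath: str) -> None:
    """Print the basic layout assumed by ImageFeaturesHdfReader (illustrative only)."""
    with h5py.File(features_hdfpath, "r") as features_hdf:
        print("split:", features_hdf.attrs["split"])
        print("num images:", len(features_hdf["image_id"]))
        print("per-image feature shape:", features_hdf["features"].shape[1:])
```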
6 changes: 3 additions & 3 deletions visdialch/decoders/disc.py
@@ -16,7 +16,7 @@ def __init__(self, config, vocabulary):
config["lstm_hidden_size"],
batch_first=True)

# options are variable length padded sequences, use DynamicRNN
# Options are variable length padded sequences, use DynamicRNN.
self.option_rnn = DynamicRNN(self.option_rnn)

def forward(self, encoder_output, batch):
@@ -58,11 +58,11 @@ def forward(self, encoder_output, batch):
)
options_embed[nonzero_options_length_indices] = nonzero_options_embed

# Repeat encoder output for every option
# Repeat encoder output for every option.
# shape: (batch_size, num_rounds, num_options, max_sequence_length)
encoder_output = encoder_output.unsqueeze(2).repeat(1, 1, num_options, 1)

# shape now same as `options`, can calculate dot product similarity
# Shape now same as `options`, can calculate dot product similarity.
# shape: (batch_size * num_rounds * num_options, lstm_hidden_state)
encoder_output = encoder_output.view(
batch_size * num_rounds * num_options, self.config["lstm_hidden_size"]
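The forward pass above is cut off just before the scores are computed. As a hedged sketch of the dot-product similarity the comments describe (the final reshape and return are assumptions, not shown in this diff):

```python
import torch

def dot_product_scores_sketch(encoder_output: torch.Tensor,
                              options_embed: torch.Tensor,
                              batch_size: int,
                              num_rounds: int,
                              num_options: int) -> torch.Tensor:
    """Score every answer option against the encoder state.

    Both inputs are expected flattened to
    (batch_size * num_rounds * num_options, lstm_hidden_size), as in the snippet above.
    """
    # Element-wise product summed over the hidden dimension is a per-row dot product.
    scores = torch.sum(options_embed * encoder_output, dim=-1)
    return scores.view(batch_size, num_rounds, num_options)
```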
