[sample] add new sample with scalar vcf values #138

Open · wants to merge 8 commits into base branch dev-v0.1.0
4 changes: 2 additions & 2 deletions docs/source/features.rst
@@ -62,8 +62,8 @@ NeMo toolkit, we leverage the data loader abstractions defined in

Currently available data loaders -

-* :class:`ReadPileupDataLoader<variantworks.dataloader.ReadPileupDataLoader>` - encapsulates loading samples from VCF and using PileupEncoders to generate training data.
-  This type of data loader is typically useful for variant calling tasks which process BAMs and VCFs simultaneously.
+* :class:`VariantDataLoader<variantworks.dataloader.VariantDataLoader>` - encapsulates a generalized data loader for reading entries from a VCF and applying
+  user-defined input and label encoders to generate data for training and test. This type of data loader is typically useful for tasks such as variant calling.
* :class:`HDFDataLoader<variantworks.dataloader.HDFDataLoader>` - encapsulates a generalized, multi-threaded data loader for loading tensors from HDF files. This type
  of data loader is frequently used when data is prepared/serialized ahead of time into an HDF file and read directly from it during training/evaluation loops.
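For orientation, here is a minimal sketch of constructing the renamed loader, assembled from the updated snippets later in this diff; the paths, batch size, and default encoder arguments are illustrative assumptions:

```python
from variantworks.dataloader import VariantDataLoader
from variantworks.encoders import PileupEncoder
from variantworks.io.vcfio import VCFReader

# Illustrative inputs; any matched VCF/BAM pair would do here.
vcf_loader = VCFReader(vcf="candidates.vcf.gz", bams=["small_bam.bam"], is_fp=False)
pileup_encoder = PileupEncoder()  # default arguments assumed
test_dataset = VariantDataLoader(VariantDataLoader.Type.TEST, [vcf_loader],
                                 batch_size=32, shuffle=False,
                                 input_encoder=pileup_encoder)
```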

6 changes: 3 additions & 3 deletions docs/source/snippets/snp_zygosity_predictor_inference.py
@@ -22,7 +22,7 @@
import pathlib
import torch

-from variantworks.dataloader import ReadPileupDataLoader
+from variantworks.dataloader import VariantDataLoader
from variantworks.io.vcfio import VCFReader, VCFWriter
from variantworks.networks import AlexNet
from variantworks.encoders import PileupEncoder, ZygosityLabelDecoder
@@ -52,8 +52,8 @@
bam = os.path.join(data_folder, "small_bam.bam")
labels = os.path.join(data_folder, "candidates.vcf.gz")
vcf_loader = VCFReader(vcf=labels, bams=[bam], is_fp=False)
-test_dataset = ReadPileupDataLoader(ReadPileupDataLoader.Type.TEST, [vcf_loader], batch_size=32,
-                                    shuffle=False, sample_encoder=pileup_encoder)
+test_dataset = VariantDataLoader(VariantDataLoader.Type.TEST, [vcf_loader], batch_size=32,
+                                 shuffle=False, input_encoder=pileup_encoder)

# Create inference DAG
encoding = test_dataset()
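For context, a hedged sketch of how this inference DAG is typically completed. The `NeuralModuleFactory.infer` call and the decoding loop follow NeMo v0.x conventions and are assumptions, not part of this diff:

```python
import nemo
import torch

from variantworks.encoders import ZygosityLabelDecoder

nf = nemo.core.NeuralModuleFactory()   # assumed to exist before the modules above
vz = model(encoding=encoding)          # AlexNet logits per variant
results = nf.infer([vz])               # evaluate the DAG over the test set

zyg_decoder = ZygosityLabelDecoder()
for batch in results[0]:
    for pred in torch.argmax(batch, dim=1):
        print(zyg_decoder(pred))       # class index -> zygosity label
```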
10 changes: 5 additions & 5 deletions docs/source/snippets/snp_zygosity_predictor_training.py
@@ -21,7 +21,7 @@
import os
import pathlib

-from variantworks.dataloader import ReadPileupDataLoader
+from variantworks.dataloader import VariantDataLoader
from variantworks.io.vcfio import VCFReader
from variantworks.networks import AlexNet
from variantworks.encoders import PileupEncoder, ZygosityLabelEncoder
@@ -59,15 +59,15 @@
vcf_loader = VCFReader(vcf=samples, bams=[bam], is_fp=False)

# Create a data loader with custom sample and label encoder.
-dataset_train = ReadPileupDataLoader(ReadPileupDataLoader.Type.TRAIN, [vcf_loader],
-                                     batch_size=32, shuffle=True,
-                                     sample_encoder=pileup_encoder, label_encoder=zyg_encoder)
+dataset_train = VariantDataLoader(VariantDataLoader.Type.TRAIN, [vcf_loader],
+                                  batch_size=32, shuffle=True,
+                                  input_encoder=pileup_encoder, label_encoder=zyg_encoder)

# Use CrossEntropyLoss to train.
vz_ce_loss = nemo.backends.pytorch.common.losses.CrossEntropyLossNM(logits_ndim=2)

# Create NeMo training DAG.
-vz_labels, encoding = dataset_train()
+encoding, vz_labels = dataset_train()
vz = model(encoding=encoding)
vz_loss = vz_ce_loss(logits=vz, labels=vz_labels)
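A hedged sketch of launching training from this point, again following NeMo v0.x conventions; the optimizer choice and hyperparameters are illustrative assumptions:

```python
import nemo

nf = nemo.core.NeuralModuleFactory()   # assumed to exist before the modules above
nf.train(tensors_to_optimize=[vz_loss],
         optimizer="adam",
         optimization_params={"num_epochs": 4, "lr": 0.001})
```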

8 changes: 8 additions & 0 deletions hooks/pre-push
@@ -37,6 +37,14 @@ echo ""
python $sdk_root_dir/ci/checks/check_copyright.py
$sdk_root_dir/style_check

echo ""
echo "Running doc source checks..."
echo ""
for f in $(find docs/source/snippets/*.py | sort -r); do
logger "Executing \"${f}\""
python "${f}"
done

cd $tests_dir
echo ""
echo "Running pre-push tests from $PWD..."
23 changes: 23 additions & 0 deletions samples/custom_encoder/README.md
@@ -0,0 +1,23 @@
# Custom Encoder Samples

This sample covers how to write a custom encoder in `VariantWorks` using scalar
values from a VCF. Such a setup is useful for pipelines that need additional
metadata passed into the network alongside, for example, the read pileup, to give
the network more context.

## Training pipeline

The training pipeline in this sample is kept extremely simple. The training and evaluation data
is generated using an online encoder (i.e. an encoder that generates tensors on the fly during the training pipeline)
and passed into a multi-layer perceptron. The objective of this pipeline is to predict the zygosity of
each variant using only metadata from the variant records.

Key files in this sample are:

1. `custom_encoder.py` - This file contains the custom encoder implementation for the VCF scalar values.
2. `custom_model.py` - This file contains a simple MLP model.
3. `pipeline.py` - The training and evaluation pipeline.

## Data

A sample training and evaluation VCF is available under the `data` folder. This can be used to run the sample.
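A hedged sketch of how these pieces might be wired together in `pipeline.py` (which this diff does not show); the paths, FORMAT keys, and layer sizes below are illustrative assumptions:

```python
from variantworks.dataloader import VariantDataLoader
from variantworks.encoders import ZygosityLabelEncoder
from variantworks.io.vcfio import VCFReader

from custom_encoder import CustomEncoder
from custom_model import MLP

# Hypothetical path and keys; see the data folder for the actual sample VCF.
vcf_loader = VCFReader(vcf="data/train.vcf.gz", is_fp=False)
input_encoder = CustomEncoder(vcf_format_keys=["DP", "GQ"])
dataset_train = VariantDataLoader(VariantDataLoader.Type.TRAIN, [vcf_loader],
                                  batch_size=32, shuffle=True,
                                  input_encoder=input_encoder,
                                  label_encoder=ZygosityLabelEncoder())
# Two scalar inputs (DP, GQ) mapped to three zygosity logits.
model = MLP(num_input_nodes=2, num_hidden_nodes=32, num_output_logits=3)
```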
52 changes: 52 additions & 0 deletions samples/custom_encoder/custom_encoder.py
@@ -0,0 +1,52 @@
#
# Copyright 2020 NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Custom encoder that converts VCF scalar values into a torch tensor."""

import torch

from variantworks.encoders import Encoder


class CustomEncoder(Encoder):
    """An encoder that converts scalar VCF format values into a flattened tensor."""

    def __init__(self, vcf_format_keys=None):
        """Constructor for the encoder.

        Args:
            vcf_format_keys : A list of FORMAT keys to process for the encoding
                              (defaults to an empty list).

        Returns:
            Instance of class.
        """
        self._vcf_format_keys = vcf_format_keys if vcf_format_keys is not None else []

    def __call__(self, variant):
        """Implement the encoding for a single variant record.

        Returns:
            VCF values in a torch tensor.
        """
        data = []
        for key in self._vcf_format_keys:
            # Look up the key's position in the record's FORMAT column, then
            # fetch the corresponding value for the first sample.
            idx = variant.format.index(key)
            val = variant.samples[0][idx]
            if isinstance(val, list):
                data.extend(val)  # flatten multi-valued fields (e.g. AD)
            else:
                data.append(val)
        tensor = torch.FloatTensor(data)
        return tensor
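A quick usage sketch of the encoder above. The stand-in variant record is fabricated for illustration; real records come from `variantworks.io.vcfio.VCFReader`:

```python
from collections import namedtuple

from custom_encoder import CustomEncoder

# Minimal stand-in mimicking the two attributes __call__ uses.
FakeVariant = namedtuple("FakeVariant", ["format", "samples"])
variant = FakeVariant(format=["DP", "GQ", "AD"], samples=[[35, 99, [20, 15]]])

encoder = CustomEncoder(vcf_format_keys=["DP", "AD"])
print(encoder(variant))  # tensor([35., 20., 15.]) -- the AD list is flattened
```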
96 changes: 96 additions & 0 deletions samples/custom_encoder/custom_model.py
@@ -0,0 +1,96 @@
#
# Copyright 2020 NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""A simple MLP model for the sample."""

import torch
import torch.nn as nn

from nemo.backends.pytorch.nm import TrainableNM
from nemo.utils.decorators import add_port_docs
from nemo.core.neural_types import NeuralType, LogitsType, VoidType
from nemo.core.neural_factory import DeviceType


class MLP(TrainableNM):
    """A Neural Module for an MLP."""

    @property
    @add_port_docs()
    def input_ports(self):
        """Return definitions of module input ports.

        Returns:
            Module input ports.
        """
        return {
            "encoding": NeuralType(('B', 'D'), VoidType()),
        }

    @property
    @add_port_docs()
    def output_ports(self):
        """Return definitions of module output ports.

        Returns:
            Module output ports.
        """
        return {
            # Variant type
            'output_logit': NeuralType(('B', 'D'), LogitsType()),
        }

    def __init__(self, num_input_nodes, num_hidden_nodes, num_output_logits, apply_softmax=False):
        """Construct an MLP NeMo instance.

        Args:
            num_input_nodes : Number of input nodes.
            num_hidden_nodes : Size of the hidden layer.
            num_output_logits : Number of output logits of the classifier.
            apply_softmax : Flag to optionally apply softmax to the last layer's output.

        Returns:
            Instance of class.
        """
        super().__init__()
        self._num_input_nodes = num_input_nodes
        self._num_hidden_nodes = num_hidden_nodes
        self._num_output_logits = num_output_logits
        self._apply_softmax = apply_softmax

        self._fc1 = nn.Linear(self._num_input_nodes, self._num_hidden_nodes)
        self._relu = nn.ReLU()
        self._fc2 = nn.Linear(self._num_hidden_nodes, self._num_output_logits)
        self._softmax = nn.Softmax(dim=1)

        self._device = torch.device(
            "cuda" if self.placement == DeviceType.GPU else "cpu")
        self.to(self._device)

    def forward(self, encoding):
        """Run a forward pass of the network.

        Args:
            encoding : Input to run the network on.

        Returns:
            Output of the forward pass.
        """
        output = self._fc1(encoding)
        output = self._relu(output)
        output = self._fc2(output)
        if self._apply_softmax:
            output = self._softmax(output)
        return output
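Finally, a hedged instantiation sketch. NeMo modules expect a neural factory to exist first, and the node counts are illustrative assumptions (two scalar FORMAT inputs, three zygosity classes):

```python
import nemo

from custom_model import MLP

nf = nemo.core.NeuralModuleFactory()  # determines CPU/GPU placement
model = MLP(num_input_nodes=2, num_hidden_nodes=32, num_output_logits=3)
```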