From 559b3b6463cb62469eea691a5215af58c8c71ca6 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 13 May 2021 11:48:20 -0600 Subject: [PATCH 01/15] :sparkles: introduce finetuning example --- .../video_classification_example/finetune.py | 255 ++++++++++++++++++ .../video_classification_example/train.py | 15 +- 2 files changed, 264 insertions(+), 6 deletions(-) create mode 100644 tutorials/video_classification_example/finetune.py diff --git a/tutorials/video_classification_example/finetune.py b/tutorials/video_classification_example/finetune.py new file mode 100644 index 00000000..21fce5bb --- /dev/null +++ b/tutorials/video_classification_example/finetune.py @@ -0,0 +1,255 @@ +from pathlib import Path +from argparse import Namespace +from torchvision.transforms._transforms_video import CenterCropVideo +from pytorchvideo.data import LabeledVideoDataset +from pytorchvideo.data.clip_sampling import UniformClipSampler +import pytorch_lightning as pl +import torch +from pytorchvideo.models.head import create_res_basic_head +from torch import nn +from torch.optim import Adam + +# HACK +from train import * + + +class UCF11DataModule(KineticsDataModule): + + def __init__( + self, + root="./", + batch_size=32, + num_workers=8, + holdout_scene=None, + side_size = 256, + crop_size = 256, + clip_mean = (0.45, 0.45, 0.45), + clip_std = (0.225, 0.225, 0.225), + num_frames = 8, + sampling_rate = 8, + frames_per_second = 30 + ): + super().__init__(Namespace(data_type='video', batch_size=batch_size, workers=num_workers)) + + self.root = Path(root) / 'action_youtube_naudio' + assert self.root.exists(), "Dataset not found." + self.batch_size = batch_size + self.num_workers = num_workers + self.holdout_scene = holdout_scene + self.side_size = side_size + self.mean = clip_mean + self.std = clip_std + self.crop_size = crop_size + self.num_frames = num_frames + self.sampling_rate = sampling_rate + self.frames_per_second = frames_per_second + self.clip_duration = (self.num_frames * self.sampling_rate) / self.frames_per_second + + self.classes = [x.name for x in self.root.glob("*") if x.is_dir()] + self.id_to_label = dict(zip(range(len(self.classes)), self.classes)) + self.class_to_label = dict(zip(self.classes, range(len(self.classes)))) + self.num_classes = len(self.classes) + + + # TODO - too many repeated .glob calls here. + self.train_paths = [] + self.val_paths = [] + self.holdout_scenes = {} + for c in self.classes: + + # Scenes within each class directory + scene_names = sorted(set(x.name for x in (self.root / c).glob("*") if x.is_dir() and x.name != 'Annotation')) + + # Holdout the last scene + # TODO - wrap this in a function so users can override the split logic + holdout_scene = scene_names[-1] + scene_names = scene_names[:-1] + + # Keep track of which scenes we held out for each class w/ a dict + self.holdout_scenes[c] = holdout_scene + + # Prepare the list of 'labeled paths' required by the LabeledVideoDataset + label_paths = [(v, {"label": self.class_to_label[c]}) for v in (self.root / c).glob("**/*.avi")] + + # HACK - this is no bueno. 
Can be done within the loop above + self.train_paths.extend([x for x in label_paths if x[0].parent.name != holdout_scene]) + self.val_paths.extend([x for x in label_paths if x[0].parent.name == holdout_scene]) + + def _video_transform(self, mode: str): + # TODO - different tsfm for val/train + return ApplyTransformToKey( + key="video", + transform=Compose( + [ + UniformTemporalSubsample(self.num_frames), + Lambda(lambda x: x / 255.0), + Normalize(self.mean, self.std), + ShortSideScale(size=self.side_size), + CenterCropVideo(crop_size=(self.crop_size, self.crop_size)), + ] + ), + ) + + def _make_dataset(self, mode: str): + """ + Defines the train DataLoader that the PyTorch Lightning Trainer trains/tests with. + """ + sampler = DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler + return LimitDataset(LabeledVideoDataset( + self.train_paths if mode == 'train' else self.val_paths, + UniformClipSampler(self.clip_duration), + decode_audio=False, + transform=self._make_transforms(mode=mode), + video_sampler=sampler, + )) + + def train_dataloader(self): + self.train_dataset = self._make_dataset('train') + return torch.utils.data.DataLoader( + self.train_dataset, + batch_size=self.args.batch_size, + num_workers=self.args.workers, + ) + + def val_dataloader(self): + self.val_dataset = self._make_dataset('val') + return torch.utils.data.DataLoader( + self.val_dataset, + batch_size=self.args.batch_size, + num_workers=self.args.workers, + ) + + +class MiniKineticsDataModule(KineticsDataModule): + TRAIN_PATH = 'train' + VAL_PATH = 'val' + + +class Classifier(pl.LightningModule): + + def __init__(self, num_classes: int = 11, lr: float = 2e-4, freeze_backbone: bool = True): + super().__init__() + self.save_hyperparameters() + + # Backbone + resnet = torch.hub.load("facebookresearch/pytorchvideo", "slow_r50", pretrained=True) + self.backbone = nn.Sequential(*list(resnet.children())[0][:-1]) + + if self.hparams.freeze_backbone: + for param in self.backbone.parameters(): + param.requires_grad = False + + # Head + self.head = create_res_basic_head(in_features=2048, out_features=self.hparams.num_classes) + + # Metrics + self.loss_fn = nn.CrossEntropyLoss() + self.train_acc = pl.metrics.Accuracy() + self.val_acc = pl.metrics.Accuracy() + self.accuracy = {'train': self.train_acc, 'val': self.val_acc} + + def forward(self, x): + if isinstance(x, dict): + x = x["video"] + feats = self.backbone(x) + return self.head(feats) + + def shared_step(self, batch, mode: str): + y_hat = self(batch["video"]) + loss = self.loss_fn(y_hat, batch["label"]) + self.log(f"{mode}_loss", loss) + + if mode in ["val", "test"]: + preds = y_hat.argmax(dim=1) + acc = self.accuracy[mode](preds, batch["label"]) + self.log(f"{mode}_acc", acc, prog_bar=True) + + return loss + + def training_step(self, batch, batch_idx): + return self.shared_step(batch, "train") + + def validation_step(self, batch, batch_idx): + return self.shared_step(batch, "val") + + def test_step(self, batch, batch_idx): + return self.shared_step(batch, "test") + + def configure_optimizers(self): + return Adam(self.parameters(), lr=self.hparams.lr) + + +def main(): + """ + To train the ResNet with the Kinetics dataset we construct the two modules above, + and pass them to the fit function of a pytorch_lightning.Trainer. + + This example can be run either locally (with default parameters) or on a Slurm + cluster. To run on a Slurm cluster provide the --on_cluster argument. 
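+
+    For a quick local run, an invocation might look something like the following
+    (the flags shown here are only illustrative; adjust them for your own setup):
+
+        python finetune.py --data_path /path/to/dataset --batch_size 16 --workers 4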
+ """ + setup_logger() + + pytorch_lightning.trainer.seed_everything() + parser = argparse.ArgumentParser() + + # Cluster parameters. + parser.add_argument("--on_cluster", action="store_true") + parser.add_argument("--job_name", default="ptv_video_classification", type=str) + parser.add_argument("--working_directory", default=".", type=str) + parser.add_argument("--partition", default="dev", type=str) + + # Model parameters. + parser.add_argument("--lr", "--learning-rate", default=0.1, type=float) + parser.add_argument("--momentum", default=0.9, type=float) + parser.add_argument("--weight_decay", default=1e-4, type=float) + parser.add_argument( + "--arch", + default="video_resnet", + choices=["video_resnet", "audio_resnet"], + type=str, + ) + + # Data parameters. + parser.add_argument("--data_path", default=None, type=str, required=True) + parser.add_argument("--video_path_prefix", default="", type=str) + parser.add_argument("--workers", default=8, type=int) + parser.add_argument("--batch_size", default=32, type=int) + parser.add_argument("--clip_duration", default=2, type=float) + parser.add_argument( + "--data_type", default="video", choices=["video", "audio"], type=str + ) + parser.add_argument("--video_num_subsampled", default=8, type=int) + parser.add_argument("--video_means", default=(0.45, 0.45, 0.45), type=tuple) + parser.add_argument("--video_stds", default=(0.225, 0.225, 0.225), type=tuple) + parser.add_argument("--video_crop_size", default=224, type=int) + parser.add_argument("--video_min_short_side_scale", default=256, type=int) + parser.add_argument("--video_max_short_side_scale", default=320, type=int) + parser.add_argument("--video_horizontal_flip_p", default=0.5, type=float) + parser.add_argument("--audio_raw_sample_rate", default=44100, type=int) + parser.add_argument("--audio_resampled_rate", default=16000, type=int) + parser.add_argument("--audio_mel_window_size", default=32, type=int) + parser.add_argument("--audio_mel_step_size", default=16, type=int) + parser.add_argument("--audio_num_mels", default=80, type=int) + parser.add_argument("--audio_mel_num_subsample", default=128, type=int) + parser.add_argument("--audio_logmel_mean", default=-7.03, type=float) + parser.add_argument("--audio_logmel_std", default=4.66, type=float) + + # Trainer parameters. + parser = pytorch_lightning.Trainer.add_argparse_args(parser) + parser.set_defaults( + max_epochs=200, + callbacks=[LearningRateMonitor()], + replace_sampler_ddp=False, + reload_dataloaders_every_epoch=False, + ) + args = parser.parse_args() + + # Get data, model, configure trainer, and train + data = MiniKineticsDataModule(args) + model = Classifier(num_classes=6) + trainer = pl.Trainer(gpus=1, precision=16, max_epochs=5) + trainer.fit(model, data) + + +if __name__ == "__main__": + main() diff --git a/tutorials/video_classification_example/train.py b/tutorials/video_classification_example/train.py index b2d896ba..17beb719 100644 --- a/tutorials/video_classification_example/train.py +++ b/tutorials/video_classification_example/train.py @@ -19,7 +19,6 @@ ShortSideScale, UniformTemporalSubsample, ) -from slurm import copy_and_run_with_config from torch.utils.data import DistributedSampler, RandomSampler from torchaudio.transforms import MelSpectrogram, Resample from torchvision.transforms import ( @@ -179,6 +178,9 @@ class KineticsDataModule(pytorch_lightning.LightningDataModule): preprocessing transforms and configures the PyTorch DataLoaders. 
""" + TRAIN_PATH = 'train.csv' + VAL_PATH = 'val.csv' + def __init__(self, args): self.args = args super().__init__() @@ -297,11 +299,11 @@ def train_dataloader(self): """ Defines the train DataLoader that the PyTorch Lightning Trainer trains/tests with. """ - sampler = DistributedSampler if self.trainer.use_ddp else RandomSampler + sampler = DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler train_transform = self._make_transforms(mode="train") self.train_dataset = LimitDataset( pytorchvideo.data.Kinetics( - data_path=os.path.join(self.args.data_path, "train.csv"), + data_path=os.path.join(self.args.data_path, self.TRAIN_PATH), clip_sampler=pytorchvideo.data.make_clip_sampler( "random", self.args.clip_duration ), @@ -320,11 +322,11 @@ def val_dataloader(self): """ Defines the train DataLoader that the PyTorch Lightning Trainer trains/tests with. """ - sampler = DistributedSampler if self.trainer.use_ddp else RandomSampler + sampler = DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler val_transform = self._make_transforms(mode="val") self.val_dataset = LimitDataset( pytorchvideo.data.Kinetics( - data_path=os.path.join(self.args.data_path, "val.csv"), + data_path=os.path.join(self.args.data_path, self.VAL_PATH), clip_sampler=pytorchvideo.data.make_clip_sampler( "uniform", self.args.clip_duration ), @@ -359,7 +361,7 @@ def __getitem__(self, index): return next(self.dataset_iter) def __len__(self): - return self.dataset.num_videos() + return self.dataset.num_videos def main(): @@ -430,6 +432,7 @@ def main(): args = parser.parse_args() if args.on_cluster: + from slurm import copy_and_run_with_config copy_and_run_with_config( train, args, From 4c97dfc794606549cff860ae9c8d2ae49dd5a8c8 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 13 May 2021 19:07:49 -0600 Subject: [PATCH 02/15] :art: improve structure, modularity of code --- .../video_classification_example/data.py | 250 ++++++++++++++++++ .../video_classification_example/finetune.py | 182 +++---------- 2 files changed, 282 insertions(+), 150 deletions(-) create mode 100644 tutorials/video_classification_example/data.py diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py new file mode 100644 index 00000000..13344680 --- /dev/null +++ b/tutorials/video_classification_example/data.py @@ -0,0 +1,250 @@ +import requests +from argparse import Namespace, ArgumentParser +import pytorch_lightning +from pathlib import Path +from shutil import unpack_archive +from pytorchvideo.transforms import ( + ApplyTransformToKey, + Normalize, + RandomShortSideScale, + RemoveKey, + ShortSideScale, + UniformTemporalSubsample, +) +from pytorchvideo.data import LabeledVideoDataset + +from torch.utils.data import DistributedSampler, RandomSampler +from torchaudio.transforms import MelSpectrogram, Resample +from torchvision.transforms import ( + CenterCrop, + Compose, + Lambda, + RandomCrop, + RandomHorizontalFlip, +) +from pytorchvideo.data import make_clip_sampler +from pytorchvideo.data.labeled_video_dataset import labeled_video_dataset +import torch +import itertools +from torch.utils.data import DataLoader +from random import shuffle + + +class LabeledVideoDataModule(pytorch_lightning.LightningDataModule): + + TRAIN_PATH = "train.csv" + VAL_PATH = "val.csv" + SOURCE_URL = None + SOURCE_DIR_NAME = None + + def __init__(self, args): + super().__init__() + self.args = args + self.root = Path(self.args.data_path) / 
self.SOURCE_DIR_NAME + if not (self.SOURCE_URL is None or self.SOURCE_DIR_NAME is None): + if not self.root.exists(): + download_and_unzip(self.SOURCE_URL, self.args.data_path, verify=getattr(self.args, 'verify', True)) + + def _make_transforms(self, mode: str): + + if self.args.data_type == "video": + transform = [ + self._video_transform(mode), + RemoveKey("audio"), + ] + elif self.args.data_type == "audio": + transform = [ + self._audio_transform(), + RemoveKey("video"), + ] + else: + raise Exception(f"{self.args.data_type} not supported") + + return Compose(transform) + + def _video_transform(self, mode: str): + args = self.args + return ApplyTransformToKey( + key="video", + transform=Compose( + [ + UniformTemporalSubsample(args.video_num_subsampled), + Normalize(args.video_means, args.video_stds), + ] + + ( + [ + RandomShortSideScale( + min_size=args.video_min_short_side_scale, + max_size=args.video_max_short_side_scale, + ), + RandomCrop(args.video_crop_size), + RandomHorizontalFlip(p=args.video_horizontal_flip_p), + ] + if mode == "train" + else [ + ShortSideScale(args.video_min_short_side_scale), + CenterCrop(args.video_crop_size), + ] + ) + ), + ) + + def _audio_transform(self): + args = self.args + n_fft = int(float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size) + hop_length = int(float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size) + eps = 1e-10 + return ApplyTransformToKey( + key="audio", + transform=Compose( + [ + Resample( + orig_freq=args.audio_raw_sample_rate, + new_freq=args.audio_resampled_rate, + ), + MelSpectrogram( + sample_rate=args.audio_resampled_rate, + n_fft=n_fft, + hop_length=hop_length, + n_mels=args.audio_num_mels, + center=False, + ), + Lambda(lambda x: x.clamp(min=eps)), + Lambda(torch.log), + UniformTemporalSubsample(args.audio_mel_num_subsample), + Lambda(lambda x: x.transpose(1, 0)), # (F, T) -> (T, F) + Lambda(lambda x: x.view(1, x.size(0), 1, x.size(1))), # (T, F) -> (1, T, 1, F) + Normalize((args.audio_logmel_mean,), (args.audio_logmel_std,)), + ] + ), + ) + + def _make_ds_and_loader(self, mode: str): + ds = LimitDataset( + labeled_video_dataset( + data_path=str(Path(self.root) / (self.TRAIN_PATH if mode == 'train' else self.VAL_PATH)), + clip_sampler=make_clip_sampler("random" if mode == 'train' else 'uniform', self.args.clip_duration), + video_path_prefix=self.args.video_path_prefix, + transform=self._make_transforms(mode=mode), + video_sampler=DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler, + ) + ) + return ds, DataLoader(ds, batch_size=self.args.batch_size, num_workers=self.args.workers) + + def train_dataloader(self): + self.train_dataset, loader = self._make_ds_and_loader('train') + return loader + + def val_dataloader(self): + self.val_dataset, loader = self._make_ds_and_loader('val') + return loader + + +class LimitDataset(torch.utils.data.Dataset): + """ + To ensure a constant number of samples are retrieved from the dataset we use this + LimitDataset wrapper. This is necessary because several of the underlying videos + may be corrupted while fetching or decoding, however, we always want the same + number of steps per epoch. 
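+
+    LabeledVideoDataset is an iterable-style dataset, so we draw samples from a
+    chained, repeated iterator over it and report the underlying dataset's
+    num_videos as this wrapper's length.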
+ """ + + def __init__(self, dataset): + super().__init__() + self.dataset = dataset + self.dataset_iter = itertools.chain.from_iterable(itertools.repeat(iter(dataset), 2)) + + def __getitem__(self, index): + return next(self.dataset_iter) + + def __len__(self): + return self.dataset.num_videos + + +class KineticsDataModule(LabeledVideoDataModule): + TRAIN_PATH = 'train.csv' + VAL_PATH = 'val.csv' + NUM_CLASSES = 700 + + +class MiniKineticsDataModule(LabeledVideoDataModule): + + TRAIN_PATH = "train" + VAL_PATH = "val" + SOURCE_URL = "https://pl-flash-data.s3.amazonaws.com/kinetics.zip" + SOURCE_DIR_NAME = 'kinetics' + NUM_CLASSES = 6 + + +class UCF11DataModule(LabeledVideoDataModule): + TRAIN_PATH = None + VAL_PATH = None + SOURCE_URL = "https://www.crcv.ucf.edu/data/YouTube_DataSet_Annotated.zip" + SOURCE_DIR_NAME = 'action_youtube_naudio' + NUM_CLASSES = 11 + + def __init__(self, args): + args.verify = False + super().__init__(args) + + data_path = Path(self.args.data_path) + root = data_path / self.SOURCE_DIR_NAME + self.classes = [x.name for x in root.glob("*") if x.is_dir()] + self.id_to_label = dict(zip(range(len(self.classes)), self.classes)) + self.class_to_label = {v: k for k, v in self.id_to_label.items()} + self.num_classes = len(self.classes) + + self.train_paths = [] + self.val_paths = [] + self.holdout_scenes = {} + for c in self.classes: + + # Scenes within each class directory + scene_names = sorted(x.name for x in (root / c).glob("*") if x.is_dir() and x.name != 'Annotation') + shuffle(scene_names) + + # Holdout a random actor/scene + holdout_scene = scene_names[-1] + scene_names = scene_names[:-1] + + # Keep track of which scenes we held out for each class w/ a dict + self.holdout_scenes[c] = holdout_scene + + for v in (root / c).glob('**/*.avi'): + labeled_path = (v, {"label": self.class_to_label[c]}) + if v.parent.name != holdout_scene: + self.train_paths.append(labeled_path) + else: + self.val_paths.append(labeled_path) + + + def _make_ds_and_loader(self, mode: str): + ds = LimitDataset( + LabeledVideoDataset( + self.train_paths if mode == 'train' else self.val_paths, + clip_sampler=make_clip_sampler("random" if mode == 'train' else 'uniform', self.args.clip_duration), + decode_audio=False, + transform=self._make_transforms(mode=mode), + video_sampler=DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler, + ) + ) + return ds, DataLoader(ds, batch_size=self.args.batch_size, num_workers=self.args.workers) + + +def download_and_unzip(url, data_dir="./", verify=True): + data_dir = Path(data_dir) + zipfile_name = url.split("/")[-1] + data_zip_path = data_dir / zipfile_name + data_dir.mkdir(exist_ok=True, parents=True) + + if not data_zip_path.exists(): + resp = requests.get(url, verify=verify) + + with data_zip_path.open("wb") as f: + f.write(resp.content) + + unpack_archive(data_zip_path, extract_dir=data_dir) + + +if __name__ == "__main__": + args = parse_args('--batch_size 4 --data_path ./yt_data'.split()) + dm = UCF11DataModule(args) diff --git a/tutorials/video_classification_example/finetune.py b/tutorials/video_classification_example/finetune.py index 21fce5bb..35055634 100644 --- a/tutorials/video_classification_example/finetune.py +++ b/tutorials/video_classification_example/finetune.py @@ -1,138 +1,30 @@ -from pathlib import Path -from argparse import Namespace -from torchvision.transforms._transforms_video import CenterCropVideo -from pytorchvideo.data import LabeledVideoDataset -from pytorchvideo.data.clip_sampling import 
UniformClipSampler +from argparse import ArgumentParser + import pytorch_lightning as pl import torch -from pytorchvideo.models.head import create_res_basic_head from torch import nn from torch.optim import Adam +from pytorchvideo.models.head import create_res_basic_head -# HACK -from train import * - - -class UCF11DataModule(KineticsDataModule): - - def __init__( - self, - root="./", - batch_size=32, - num_workers=8, - holdout_scene=None, - side_size = 256, - crop_size = 256, - clip_mean = (0.45, 0.45, 0.45), - clip_std = (0.225, 0.225, 0.225), - num_frames = 8, - sampling_rate = 8, - frames_per_second = 30 - ): - super().__init__(Namespace(data_type='video', batch_size=batch_size, workers=num_workers)) - - self.root = Path(root) / 'action_youtube_naudio' - assert self.root.exists(), "Dataset not found." - self.batch_size = batch_size - self.num_workers = num_workers - self.holdout_scene = holdout_scene - self.side_size = side_size - self.mean = clip_mean - self.std = clip_std - self.crop_size = crop_size - self.num_frames = num_frames - self.sampling_rate = sampling_rate - self.frames_per_second = frames_per_second - self.clip_duration = (self.num_frames * self.sampling_rate) / self.frames_per_second - - self.classes = [x.name for x in self.root.glob("*") if x.is_dir()] - self.id_to_label = dict(zip(range(len(self.classes)), self.classes)) - self.class_to_label = dict(zip(self.classes, range(len(self.classes)))) - self.num_classes = len(self.classes) - - - # TODO - too many repeated .glob calls here. - self.train_paths = [] - self.val_paths = [] - self.holdout_scenes = {} - for c in self.classes: - - # Scenes within each class directory - scene_names = sorted(set(x.name for x in (self.root / c).glob("*") if x.is_dir() and x.name != 'Annotation')) - - # Holdout the last scene - # TODO - wrap this in a function so users can override the split logic - holdout_scene = scene_names[-1] - scene_names = scene_names[:-1] - - # Keep track of which scenes we held out for each class w/ a dict - self.holdout_scenes[c] = holdout_scene - - # Prepare the list of 'labeled paths' required by the LabeledVideoDataset - label_paths = [(v, {"label": self.class_to_label[c]}) for v in (self.root / c).glob("**/*.avi")] - - # HACK - this is no bueno. Can be done within the loop above - self.train_paths.extend([x for x in label_paths if x[0].parent.name != holdout_scene]) - self.val_paths.extend([x for x in label_paths if x[0].parent.name == holdout_scene]) - - def _video_transform(self, mode: str): - # TODO - different tsfm for val/train - return ApplyTransformToKey( - key="video", - transform=Compose( - [ - UniformTemporalSubsample(self.num_frames), - Lambda(lambda x: x / 255.0), - Normalize(self.mean, self.std), - ShortSideScale(size=self.side_size), - CenterCropVideo(crop_size=(self.crop_size, self.crop_size)), - ] - ), - ) - - def _make_dataset(self, mode: str): - """ - Defines the train DataLoader that the PyTorch Lightning Trainer trains/tests with. 
- """ - sampler = DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler - return LimitDataset(LabeledVideoDataset( - self.train_paths if mode == 'train' else self.val_paths, - UniformClipSampler(self.clip_duration), - decode_audio=False, - transform=self._make_transforms(mode=mode), - video_sampler=sampler, - )) - - def train_dataloader(self): - self.train_dataset = self._make_dataset('train') - return torch.utils.data.DataLoader( - self.train_dataset, - batch_size=self.args.batch_size, - num_workers=self.args.workers, - ) - - def val_dataloader(self): - self.val_dataset = self._make_dataset('val') - return torch.utils.data.DataLoader( - self.val_dataset, - batch_size=self.args.batch_size, - num_workers=self.args.workers, - ) - - -class MiniKineticsDataModule(KineticsDataModule): - TRAIN_PATH = 'train' - VAL_PATH = 'val' +from data import UCF11DataModule, KineticsDataModule, MiniKineticsDataModule +from models import Classifier + + +DATASET_MAP = { + "ucf11": UCF11DataModule, + "kinetics": KineticsDataModule, + "kinetics-mini": MiniKineticsDataModule, +} class Classifier(pl.LightningModule): - def __init__(self, num_classes: int = 11, lr: float = 2e-4, freeze_backbone: bool = True): + def __init__(self, num_classes: int = 11, lr: float = 2e-4, freeze_backbone: bool = True, pretrained: bool = True): super().__init__() self.save_hyperparameters() # Backbone - resnet = torch.hub.load("facebookresearch/pytorchvideo", "slow_r50", pretrained=True) + resnet = torch.hub.load("facebookresearch/pytorchvideo", 'slow_r50', pretrained=self.hparams.pretrained) self.backbone = nn.Sequential(*list(resnet.children())[0][:-1]) if self.hparams.freeze_backbone: @@ -179,24 +71,8 @@ def configure_optimizers(self): return Adam(self.parameters(), lr=self.hparams.lr) -def main(): - """ - To train the ResNet with the Kinetics dataset we construct the two modules above, - and pass them to the fit function of a pytorch_lightning.Trainer. - - This example can be run either locally (with default parameters) or on a Slurm - cluster. To run on a Slurm cluster provide the --on_cluster argument. - """ - setup_logger() - - pytorch_lightning.trainer.seed_everything() - parser = argparse.ArgumentParser() - - # Cluster parameters. - parser.add_argument("--on_cluster", action="store_true") - parser.add_argument("--job_name", default="ptv_video_classification", type=str) - parser.add_argument("--working_directory", default=".", type=str) - parser.add_argument("--partition", default="dev", type=str) +def parse_args(args=None): + parser = ArgumentParser() # Model parameters. parser.add_argument("--lr", "--learning-rate", default=0.1, type=float) @@ -209,7 +85,10 @@ def main(): type=str, ) - # Data parameters. + # Data parameters + parser.add_argument( + "--dataset", default="ucf11", choices=["ucf11", "kinetics", "kinetics-mini"] + ) parser.add_argument("--data_path", default=None, type=str, required=True) parser.add_argument("--video_path_prefix", default="", type=str) parser.add_argument("--workers", default=8, type=int) @@ -235,21 +114,24 @@ def main(): parser.add_argument("--audio_logmel_std", default=4.66, type=float) # Trainer parameters. 
- parser = pytorch_lightning.Trainer.add_argparse_args(parser) + parser = pl.Trainer.add_argparse_args(parser) parser.set_defaults( max_epochs=200, - callbacks=[LearningRateMonitor()], + callbacks=[pl.callbacks.LearningRateMonitor()], replace_sampler_ddp=False, reload_dataloaders_every_epoch=False, ) - args = parser.parse_args() + return parser.parse_args(args=args) + - # Get data, model, configure trainer, and train - data = MiniKineticsDataModule(args) - model = Classifier(num_classes=6) - trainer = pl.Trainer(gpus=1, precision=16, max_epochs=5) - trainer.fit(model, data) +def main(args): + pl.trainer.seed_everything() + dm_cls = DATASET_MAP.get(args.dataset) + dm = dm_cls(args) + model = Classifier(num_classes=dm_cls.NUM_CLASSES) + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model, dm) if __name__ == "__main__": - main() + main(parse_args()) From 42b3d7c679359f65f1a13fa9111caff57c1db9ca Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 13 May 2021 19:10:44 -0600 Subject: [PATCH 03/15] :lipstick: style --- .../video_classification_example/data.py | 98 ++++++++++++------- .../video_classification_example/finetune.py | 28 ++++-- 2 files changed, 82 insertions(+), 44 deletions(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index 13344680..4dd379cb 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -1,8 +1,14 @@ -import requests -from argparse import Namespace, ArgumentParser -import pytorch_lightning +import itertools +from argparse import ArgumentParser, Namespace from pathlib import Path +from random import shuffle from shutil import unpack_archive + +import pytorch_lightning +import requests +import torch +from pytorchvideo.data import LabeledVideoDataset, make_clip_sampler +from pytorchvideo.data.labeled_video_dataset import labeled_video_dataset from pytorchvideo.transforms import ( ApplyTransformToKey, Normalize, @@ -11,9 +17,7 @@ ShortSideScale, UniformTemporalSubsample, ) -from pytorchvideo.data import LabeledVideoDataset - -from torch.utils.data import DistributedSampler, RandomSampler +from torch.utils.data import DataLoader, DistributedSampler, RandomSampler from torchaudio.transforms import MelSpectrogram, Resample from torchvision.transforms import ( CenterCrop, @@ -22,12 +26,6 @@ RandomCrop, RandomHorizontalFlip, ) -from pytorchvideo.data import make_clip_sampler -from pytorchvideo.data.labeled_video_dataset import labeled_video_dataset -import torch -import itertools -from torch.utils.data import DataLoader -from random import shuffle class LabeledVideoDataModule(pytorch_lightning.LightningDataModule): @@ -43,7 +41,11 @@ def __init__(self, args): self.root = Path(self.args.data_path) / self.SOURCE_DIR_NAME if not (self.SOURCE_URL is None or self.SOURCE_DIR_NAME is None): if not self.root.exists(): - download_and_unzip(self.SOURCE_URL, self.args.data_path, verify=getattr(self.args, 'verify', True)) + download_and_unzip( + self.SOURCE_URL, + self.args.data_path, + verify=getattr(self.args, "verify", True), + ) def _make_transforms(self, mode: str): @@ -91,8 +93,12 @@ def _video_transform(self, mode: str): def _audio_transform(self): args = self.args - n_fft = int(float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size) - hop_length = int(float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size) + n_fft = int( + float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size + ) + hop_length = int( + 
float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size + ) eps = 1e-10 return ApplyTransformToKey( key="audio", @@ -113,7 +119,9 @@ def _audio_transform(self): Lambda(torch.log), UniformTemporalSubsample(args.audio_mel_num_subsample), Lambda(lambda x: x.transpose(1, 0)), # (F, T) -> (T, F) - Lambda(lambda x: x.view(1, x.size(0), 1, x.size(1))), # (T, F) -> (1, T, 1, F) + Lambda( + lambda x: x.view(1, x.size(0), 1, x.size(1)) + ), # (T, F) -> (1, T, 1, F) Normalize((args.audio_logmel_mean,), (args.audio_logmel_std,)), ] ), @@ -122,21 +130,30 @@ def _audio_transform(self): def _make_ds_and_loader(self, mode: str): ds = LimitDataset( labeled_video_dataset( - data_path=str(Path(self.root) / (self.TRAIN_PATH if mode == 'train' else self.VAL_PATH)), - clip_sampler=make_clip_sampler("random" if mode == 'train' else 'uniform', self.args.clip_duration), + data_path=str( + Path(self.root) + / (self.TRAIN_PATH if mode == "train" else self.VAL_PATH) + ), + clip_sampler=make_clip_sampler( + "random" if mode == "train" else "uniform", self.args.clip_duration + ), video_path_prefix=self.args.video_path_prefix, transform=self._make_transforms(mode=mode), - video_sampler=DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler, + video_sampler=DistributedSampler + if (self.trainer is not None and self.trainer.use_ddp) + else RandomSampler, ) ) - return ds, DataLoader(ds, batch_size=self.args.batch_size, num_workers=self.args.workers) + return ds, DataLoader( + ds, batch_size=self.args.batch_size, num_workers=self.args.workers + ) def train_dataloader(self): - self.train_dataset, loader = self._make_ds_and_loader('train') + self.train_dataset, loader = self._make_ds_and_loader("train") return loader def val_dataloader(self): - self.val_dataset, loader = self._make_ds_and_loader('val') + self.val_dataset, loader = self._make_ds_and_loader("val") return loader @@ -151,7 +168,9 @@ class LimitDataset(torch.utils.data.Dataset): def __init__(self, dataset): super().__init__() self.dataset = dataset - self.dataset_iter = itertools.chain.from_iterable(itertools.repeat(iter(dataset), 2)) + self.dataset_iter = itertools.chain.from_iterable( + itertools.repeat(iter(dataset), 2) + ) def __getitem__(self, index): return next(self.dataset_iter) @@ -161,8 +180,8 @@ def __len__(self): class KineticsDataModule(LabeledVideoDataModule): - TRAIN_PATH = 'train.csv' - VAL_PATH = 'val.csv' + TRAIN_PATH = "train.csv" + VAL_PATH = "val.csv" NUM_CLASSES = 700 @@ -171,7 +190,7 @@ class MiniKineticsDataModule(LabeledVideoDataModule): TRAIN_PATH = "train" VAL_PATH = "val" SOURCE_URL = "https://pl-flash-data.s3.amazonaws.com/kinetics.zip" - SOURCE_DIR_NAME = 'kinetics' + SOURCE_DIR_NAME = "kinetics" NUM_CLASSES = 6 @@ -179,7 +198,7 @@ class UCF11DataModule(LabeledVideoDataModule): TRAIN_PATH = None VAL_PATH = None SOURCE_URL = "https://www.crcv.ucf.edu/data/YouTube_DataSet_Annotated.zip" - SOURCE_DIR_NAME = 'action_youtube_naudio' + SOURCE_DIR_NAME = "action_youtube_naudio" NUM_CLASSES = 11 def __init__(self, args): @@ -199,7 +218,11 @@ def __init__(self, args): for c in self.classes: # Scenes within each class directory - scene_names = sorted(x.name for x in (root / c).glob("*") if x.is_dir() and x.name != 'Annotation') + scene_names = sorted( + x.name + for x in (root / c).glob("*") + if x.is_dir() and x.name != "Annotation" + ) shuffle(scene_names) # Holdout a random actor/scene @@ -209,25 +232,30 @@ def __init__(self, args): # Keep track of which scenes we held out for each class w/ a 
dict self.holdout_scenes[c] = holdout_scene - for v in (root / c).glob('**/*.avi'): + for v in (root / c).glob("**/*.avi"): labeled_path = (v, {"label": self.class_to_label[c]}) if v.parent.name != holdout_scene: self.train_paths.append(labeled_path) else: self.val_paths.append(labeled_path) - def _make_ds_and_loader(self, mode: str): ds = LimitDataset( LabeledVideoDataset( - self.train_paths if mode == 'train' else self.val_paths, - clip_sampler=make_clip_sampler("random" if mode == 'train' else 'uniform', self.args.clip_duration), + self.train_paths if mode == "train" else self.val_paths, + clip_sampler=make_clip_sampler( + "random" if mode == "train" else "uniform", self.args.clip_duration + ), decode_audio=False, transform=self._make_transforms(mode=mode), - video_sampler=DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler, + video_sampler=DistributedSampler + if (self.trainer is not None and self.trainer.use_ddp) + else RandomSampler, ) ) - return ds, DataLoader(ds, batch_size=self.args.batch_size, num_workers=self.args.workers) + return ds, DataLoader( + ds, batch_size=self.args.batch_size, num_workers=self.args.workers + ) def download_and_unzip(url, data_dir="./", verify=True): @@ -246,5 +274,5 @@ def download_and_unzip(url, data_dir="./", verify=True): if __name__ == "__main__": - args = parse_args('--batch_size 4 --data_path ./yt_data'.split()) + args = parse_args("--batch_size 4 --data_path ./yt_data".split()) dm = UCF11DataModule(args) diff --git a/tutorials/video_classification_example/finetune.py b/tutorials/video_classification_example/finetune.py index 35055634..d77c9eb1 100644 --- a/tutorials/video_classification_example/finetune.py +++ b/tutorials/video_classification_example/finetune.py @@ -2,12 +2,11 @@ import pytorch_lightning as pl import torch +from data import KineticsDataModule, MiniKineticsDataModule, UCF11DataModule +from models import Classifier +from pytorchvideo.models.head import create_res_basic_head from torch import nn from torch.optim import Adam -from pytorchvideo.models.head import create_res_basic_head - -from data import UCF11DataModule, KineticsDataModule, MiniKineticsDataModule -from models import Classifier DATASET_MAP = { @@ -18,13 +17,22 @@ class Classifier(pl.LightningModule): - - def __init__(self, num_classes: int = 11, lr: float = 2e-4, freeze_backbone: bool = True, pretrained: bool = True): + def __init__( + self, + num_classes: int = 11, + lr: float = 2e-4, + freeze_backbone: bool = True, + pretrained: bool = True, + ): super().__init__() self.save_hyperparameters() # Backbone - resnet = torch.hub.load("facebookresearch/pytorchvideo", 'slow_r50', pretrained=self.hparams.pretrained) + resnet = torch.hub.load( + "facebookresearch/pytorchvideo", + "slow_r50", + pretrained=self.hparams.pretrained, + ) self.backbone = nn.Sequential(*list(resnet.children())[0][:-1]) if self.hparams.freeze_backbone: @@ -32,13 +40,15 @@ def __init__(self, num_classes: int = 11, lr: float = 2e-4, freeze_backbone: boo param.requires_grad = False # Head - self.head = create_res_basic_head(in_features=2048, out_features=self.hparams.num_classes) + self.head = create_res_basic_head( + in_features=2048, out_features=self.hparams.num_classes + ) # Metrics self.loss_fn = nn.CrossEntropyLoss() self.train_acc = pl.metrics.Accuracy() self.val_acc = pl.metrics.Accuracy() - self.accuracy = {'train': self.train_acc, 'val': self.val_acc} + self.accuracy = {"train": self.train_acc, "val": self.val_acc} def forward(self, x): if isinstance(x, 
dict): From fb55f1f53add12fd8522ed9e36110296281cf0e6 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 13 May 2021 19:13:53 -0600 Subject: [PATCH 04/15] :rotating_light: remove unused import --- tutorials/video_classification_example/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index 4dd379cb..6c23dce2 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -1,5 +1,4 @@ import itertools -from argparse import ArgumentParser, Namespace from pathlib import Path from random import shuffle from shutil import unpack_archive From 7fd088010c798a53cce0431f4bb78ec276ab771c Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 00:58:12 -0600 Subject: [PATCH 05/15] :construction: wip --- website/docs/tutorial_finetuning.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 website/docs/tutorial_finetuning.md diff --git a/website/docs/tutorial_finetuning.md b/website/docs/tutorial_finetuning.md new file mode 100644 index 00000000..ba7d1763 --- /dev/null +++ b/website/docs/tutorial_finetuning.md @@ -0,0 +1,9 @@ +--- +id: tutorial_finetuning +title: Finetune a TorchHub model for Classification +--- + +# Introduction + +In this tutorial, you will learn how to finetune a pre-trained [Slow Resnet50 model from TorchHub](https://pytorch.org/hub/facebookresearch_pytorchvideo_resnet/) on the [UCF11 Dataset](https://www.crcv.ucf.edu/data/UCF_YouTube_Action.php). + From 0ec28aa3117e6e236c87d6372f30214945a66940 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 00:58:35 -0600 Subject: [PATCH 06/15] :construction: wip --- .../video_classification_example/data.py | 16 +++++++-- .../video_classification_example/finetune.py | 33 ++++++++++++------- .../video_classification_example/train.py | 3 +- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index 6c23dce2..16eb3857 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -70,6 +70,7 @@ def _video_transform(self, mode: str): transform=Compose( [ UniformTemporalSubsample(args.video_num_subsampled), + Lambda(lambda x: x / 255.0), Normalize(args.video_means, args.video_stds), ] + ( @@ -217,7 +218,7 @@ def __init__(self, args): for c in self.classes: # Scenes within each class directory - scene_names = sorted( + scene_names = list( x.name for x in (root / c).glob("*") if x.is_dir() and x.name != "Annotation" @@ -273,5 +274,16 @@ def download_and_unzip(url, data_dir="./", verify=True): if __name__ == "__main__": - args = parse_args("--batch_size 4 --data_path ./yt_data".split()) + from finetune import parse_args + from train import LearningRateMonitor, VideoClassificationLightningModule + args = parse_args("--gpus 1 --precision 16 --batch_size 8 --data_path ./yt_data".split()) + args.max_epochs = 200 + args.callbacks = [LearningRateMonitor()] + args.replace_sampler_ddp = False + args.reload_dataloaders_every_epoch = False + + pytorch_lightning.trainer.seed_everything(244) dm = UCF11DataModule(args) + model = VideoClassificationLightningModule(args) + trainer = pytorch_lightning.Trainer.from_argparse_args(args) + trainer.fit(model, dm) diff --git a/tutorials/video_classification_example/finetune.py b/tutorials/video_classification_example/finetune.py index d77c9eb1..cff7fa99 100644 --- 
a/tutorials/video_classification_example/finetune.py +++ b/tutorials/video_classification_example/finetune.py @@ -2,8 +2,7 @@ import pytorch_lightning as pl import torch -from data import KineticsDataModule, MiniKineticsDataModule, UCF11DataModule -from models import Classifier +from .data import KineticsDataModule, MiniKineticsDataModule, UCF11DataModule from pytorchvideo.models.head import create_res_basic_head from torch import nn from torch.optim import Adam @@ -17,17 +16,30 @@ class Classifier(pl.LightningModule): + """ + """ def __init__( self, num_classes: int = 11, lr: float = 2e-4, freeze_backbone: bool = True, pretrained: bool = True, + **kwargs ): + """A classifier for finetuning pretrained video classification backbones from + torchhub. We use the slow_r50 model here, but you can edit this class to + use whatever backbone/head you'd like. + + Args: + num_classes (int, optional): Number of output classes. Defaults to 11. + lr (float, optional): The learning rate for the Adam optimizer. Defaults to 2e-4. + freeze_backbone (bool, optional): Whether to freeze the backbone or leave it trainable. Defaults to True. + pretrained (bool, optional): Use the pretrained model from torchhub. When False, we initialize the slow_r50 model from scratch. Defaults to True. + """ super().__init__() self.save_hyperparameters() - # Backbone + # The pretrained resnet model - we strip off its head to get the backbone resnet = torch.hub.load( "facebookresearch/pytorchvideo", "slow_r50", @@ -35,26 +47,24 @@ def __init__( ) self.backbone = nn.Sequential(*list(resnet.children())[0][:-1]) + # Freeze the backbone layers if specified if self.hparams.freeze_backbone: for param in self.backbone.parameters(): param.requires_grad = False - # Head + # Create a new head we will train on top of the backbone self.head = create_res_basic_head( in_features=2048, out_features=self.hparams.num_classes ) - # Metrics + # Metrics we will keep track of self.loss_fn = nn.CrossEntropyLoss() self.train_acc = pl.metrics.Accuracy() self.val_acc = pl.metrics.Accuracy() self.accuracy = {"train": self.train_acc, "val": self.val_acc} - def forward(self, x): - if isinstance(x, dict): - x = x["video"] - feats = self.backbone(x) - return self.head(feats) + def forward(self, x: torch.Tensor): + return self.head(self.backbone(x)) def shared_step(self, batch, mode: str): y_hat = self(batch["video"]) @@ -127,7 +137,6 @@ def parse_args(args=None): parser = pl.Trainer.add_argparse_args(parser) parser.set_defaults( max_epochs=200, - callbacks=[pl.callbacks.LearningRateMonitor()], replace_sampler_ddp=False, reload_dataloaders_every_epoch=False, ) @@ -138,7 +147,7 @@ def main(args): pl.trainer.seed_everything() dm_cls = DATASET_MAP.get(args.dataset) dm = dm_cls(args) - model = Classifier(num_classes=dm_cls.NUM_CLASSES) + model = Classifier(num_classes=dm_cls.NUM_CLASSES, **vars(args)) trainer = pl.Trainer.from_argparse_args(args) trainer.fit(model, dm) diff --git a/tutorials/video_classification_example/train.py b/tutorials/video_classification_example/train.py index 17beb719..8588d129 100644 --- a/tutorials/video_classification_example/train.py +++ b/tutorials/video_classification_example/train.py @@ -80,7 +80,7 @@ def __init__(self, args): if self.args.arch == "video_resnet": self.model = pytorchvideo.models.resnet.create_resnet( input_channel=3, - model_num_class=400, + model_num_class=11 # 400, ) self.batch_key = "video" elif self.args.arch == "audio_resnet": @@ -235,6 +235,7 @@ def _video_transform(self, mode: str): transform=Compose( [ 
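+                    # The transforms below subsample frames, rescale raw pixel values
+                    # from [0, 255] to [0, 1], and then normalize with the dataset mean/std.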
UniformTemporalSubsample(args.video_num_subsampled), + Lambda(lambda x: x/255.0), Normalize(args.video_means, args.video_stds), ] + ( From e126c63143c3d7a9d495ee415645c9e0eccdaa91 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 01:30:45 -0600 Subject: [PATCH 07/15] :pencil: Writing docs. --- .../video_classification_example/data.py | 29 +++++----- .../video_classification_example/finetune.py | 55 ++++++++++++++++--- 2 files changed, 59 insertions(+), 25 deletions(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index 16eb3857..988e47fa 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -128,6 +128,12 @@ def _audio_transform(self): ) def _make_ds_and_loader(self, mode: str): + """Creates both the dataset and dataloader for a given dataset split 'mode'. This returns + both the dataset and the dataloader specified, and should be called from self.{train|val|test}_dataloader(). + + Args: + mode (str): The dataset split to create. Should be 'train' or 'val'. + """ ds = LimitDataset( labeled_video_dataset( data_path=str( @@ -259,6 +265,13 @@ def _make_ds_and_loader(self, mode: str): def download_and_unzip(url, data_dir="./", verify=True): + """Download a zip file from a given URL and unpack it within data_dir. + + Args: + url (str): A URL to a zip file. + data_dir (str, optional): Directory where the zip will be unpacked. Defaults to "./". + verify (bool, optional): Whether to verify SSL certificate when requesting the zip file. Defaults to True. + """ data_dir = Path(data_dir) zipfile_name = url.split("/")[-1] data_zip_path = data_dir / zipfile_name @@ -271,19 +284,3 @@ def download_and_unzip(url, data_dir="./", verify=True): f.write(resp.content) unpack_archive(data_zip_path, extract_dir=data_dir) - - -if __name__ == "__main__": - from finetune import parse_args - from train import LearningRateMonitor, VideoClassificationLightningModule - args = parse_args("--gpus 1 --precision 16 --batch_size 8 --data_path ./yt_data".split()) - args.max_epochs = 200 - args.callbacks = [LearningRateMonitor()] - args.replace_sampler_ddp = False - args.reload_dataloaders_every_epoch = False - - pytorch_lightning.trainer.seed_everything(244) - dm = UCF11DataModule(args) - model = VideoClassificationLightningModule(args) - trainer = pytorch_lightning.Trainer.from_argparse_args(args) - trainer.fit(model, dm) diff --git a/tutorials/video_classification_example/finetune.py b/tutorials/video_classification_example/finetune.py index cff7fa99..2aca590d 100644 --- a/tutorials/video_classification_example/finetune.py +++ b/tutorials/video_classification_example/finetune.py @@ -2,7 +2,7 @@ import pytorch_lightning as pl import torch -from .data import KineticsDataModule, MiniKineticsDataModule, UCF11DataModule +from data import KineticsDataModule, MiniKineticsDataModule, UCF11DataModule from pytorchvideo.models.head import create_res_basic_head from torch import nn from torch.optim import Adam @@ -16,8 +16,7 @@ class Classifier(pl.LightningModule): - """ - """ + def __init__( self, num_classes: int = 11, @@ -35,6 +34,8 @@ def __init__( lr (float, optional): The learning rate for the Adam optimizer. Defaults to 2e-4. freeze_backbone (bool, optional): Whether to freeze the backbone or leave it trainable. Defaults to True. pretrained (bool, optional): Use the pretrained model from torchhub. When False, we initialize the slow_r50 model from scratch. Defaults to True. 
+ + All extra kwargs will be available via self.hparams.. These will also be saved as TensorBoard Hparams. """ super().__init__() self.save_hyperparameters() @@ -64,9 +65,23 @@ def __init__( self.accuracy = {"train": self.train_acc, "val": self.val_acc} def forward(self, x: torch.Tensor): + """ + Forward defines the prediction/inference actions. + """ return self.head(self.backbone(x)) def shared_step(self, batch, mode: str): + """This shared step handles both the training and validation steps to avoid + re-writing the same code more than once. The given `mode` will change the name + of the logged metrics. + + Args: + batch (dict): PyTorchVideo batch dictionary containing a single batch of data. + mode (str): The type of step. Can be 'train', 'val', or 'test'. + + Returns: + torch.Tensor: The loss for a single batch step. + """ y_hat = self(batch["video"]) loss = self.loss_fn(y_hat, batch["label"]) self.log(f"{mode}_loss", loss) @@ -79,9 +94,35 @@ def shared_step(self, batch, mode: str): return loss def training_step(self, batch, batch_idx): + """ + This function is called in the inner loop of the training epoch. It must + return a loss that is used for loss.backwards() internally. The self.log(...) + function can be used to log any training metrics. + + PyTorchVideo batches are dictionaries containing each modality or metadata of + the batch collated video clips. Kinetics contains the following notable keys: + { + 'video': , + 'audio': , + 'label': , + } + + - "video" is a Tensor of shape (batch, channels, time, height, Width) + - "audio" is a Tensor of shape (batch, channels, time, 1, frequency) + - "label" is a Tensor of shape (batch, 1) + + The PyTorchVideo models and transforms expect the same input shapes and + dictionary structure making this function just a matter of unwrapping the dict and + feeding it through the model/loss. + """ return self.shared_step(batch, "train") def validation_step(self, batch, batch_idx): + """ + This function is called in the inner loop of the evaluation cycle. For this + simple example it's mostly the same as the training loop but with a different + metric name. + """ return self.shared_step(batch, "val") def test_step(self, batch, batch_idx): @@ -133,13 +174,9 @@ def parse_args(args=None): parser.add_argument("--audio_logmel_mean", default=-7.03, type=float) parser.add_argument("--audio_logmel_std", default=4.66, type=float) - # Trainer parameters. 
+ # Add PyTorch Lightning's Trainer init arguments as parser flags parser = pl.Trainer.add_argparse_args(parser) - parser.set_defaults( - max_epochs=200, - replace_sampler_ddp=False, - reload_dataloaders_every_epoch=False, - ) + return parser.parse_args(args=args) From e4c8cbff1d843c94013536b604ddcbd8524ae415 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 11:36:01 -0600 Subject: [PATCH 08/15] :art: improve structure + cleanup unnecessary code --- .../video_classification_example/data.py | 394 ++++++++------- .../video_classification_example/finetune.py | 214 ++------ .../video_classification_example/models.py | 150 ++++++ .../video_classification_example/train.py | 460 +----------------- 4 files changed, 411 insertions(+), 807 deletions(-) create mode 100644 tutorials/video_classification_example/models.py diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index 988e47fa..b1c29268 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -2,8 +2,9 @@ from pathlib import Path from random import shuffle from shutil import unpack_archive +from typing import Tuple -import pytorch_lightning +import pytorch_lightning as pl import requests import torch from pytorchvideo.data import LabeledVideoDataset, make_clip_sampler @@ -12,12 +13,11 @@ ApplyTransformToKey, Normalize, RandomShortSideScale, - RemoveKey, ShortSideScale, UniformTemporalSubsample, ) + from torch.utils.data import DataLoader, DistributedSampler, RandomSampler -from torchaudio.transforms import MelSpectrogram, Resample from torchvision.transforms import ( CenterCrop, Compose, @@ -27,241 +27,234 @@ ) -class LabeledVideoDataModule(pytorch_lightning.LightningDataModule): - - TRAIN_PATH = "train.csv" - VAL_PATH = "val.csv" - SOURCE_URL = None - SOURCE_DIR_NAME = None +class LabeledVideoDataModule(pl.LightningDataModule): + + SOURCE_URL: str = None + SOURCE_DIR_NAME: str = "" + NUM_CLASSES: int = 700 + VERIFY_SSL: bool = True + + def __init__( + self, + root: str = './', + clip_duration: int = 2, + video_num_subsampled: int = 8, + video_crop_size: int = 224, + video_means: Tuple[float] = (0.45, 0.45, 0.45), + video_stds: Tuple[float] = (0.225, 0.225, 0.225), + video_min_short_side_scale: int = 256, + video_max_short_side_scale: int = 320, + video_horizontal_flip_p: float = 0.5, + batch_size: int = 4, + workers: int = 4, + **kwargs + ): - def __init__(self, args): super().__init__() - self.args = args - self.root = Path(self.args.data_path) / self.SOURCE_DIR_NAME - if not (self.SOURCE_URL is None or self.SOURCE_DIR_NAME is None): - if not self.root.exists(): - download_and_unzip( - self.SOURCE_URL, - self.args.data_path, - verify=getattr(self.args, "verify", True), - ) - - def _make_transforms(self, mode: str): - - if self.args.data_type == "video": - transform = [ - self._video_transform(mode), - RemoveKey("audio"), - ] - elif self.args.data_type == "audio": - transform = [ - self._audio_transform(), - RemoveKey("video"), - ] - else: - raise Exception(f"{self.args.data_type} not supported") - - return Compose(transform) - - def _video_transform(self, mode: str): - args = self.args - return ApplyTransformToKey( - key="video", + self.root = root + self.data_path = Path(self.root) / self.SOURCE_DIR_NAME + self.clip_duration = clip_duration + self.video_num_subsampled = video_num_subsampled + self.video_crop_size = video_crop_size + self.video_means = video_means + self.video_stds = video_stds + 
self.video_min_short_side_scale = video_min_short_side_scale + self.video_max_short_side_scale = video_max_short_side_scale + self.video_horizontal_flip_p = video_horizontal_flip_p + self.batch_size = batch_size + self.workers = workers + + # Transforms applied to train dataset + self.train_transform = ApplyTransformToKey( + key='video', transform=Compose( [ - UniformTemporalSubsample(args.video_num_subsampled), + UniformTemporalSubsample(self.video_num_subsampled), Lambda(lambda x: x / 255.0), - Normalize(args.video_means, args.video_stds), + Normalize(self.video_means, self.video_stds), + RandomShortSideScale( + min_size=self.video_min_short_side_scale, + max_size=self.video_max_short_side_scale, + ), + RandomCrop(self.video_crop_size), + RandomHorizontalFlip(p=self.video_horizontal_flip_p), ] - + ( - [ - RandomShortSideScale( - min_size=args.video_min_short_side_scale, - max_size=args.video_max_short_side_scale, - ), - RandomCrop(args.video_crop_size), - RandomHorizontalFlip(p=args.video_horizontal_flip_p), - ] - if mode == "train" - else [ - ShortSideScale(args.video_min_short_side_scale), - CenterCrop(args.video_crop_size), - ] - ) - ), + ) ) - def _audio_transform(self): - args = self.args - n_fft = int( - float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size - ) - hop_length = int( - float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size - ) - eps = 1e-10 - return ApplyTransformToKey( - key="audio", + # Transforms applied on val dataset or for inference + self.val_transform = ApplyTransformToKey( + key='video', transform=Compose( [ - Resample( - orig_freq=args.audio_raw_sample_rate, - new_freq=args.audio_resampled_rate, - ), - MelSpectrogram( - sample_rate=args.audio_resampled_rate, - n_fft=n_fft, - hop_length=hop_length, - n_mels=args.audio_num_mels, - center=False, - ), - Lambda(lambda x: x.clamp(min=eps)), - Lambda(torch.log), - UniformTemporalSubsample(args.audio_mel_num_subsample), - Lambda(lambda x: x.transpose(1, 0)), # (F, T) -> (T, F) - Lambda( - lambda x: x.view(1, x.size(0), 1, x.size(1)) - ), # (T, F) -> (1, T, 1, F) - Normalize((args.audio_logmel_mean,), (args.audio_logmel_std,)), + UniformTemporalSubsample(self.video_num_subsampled), + Lambda(lambda x: x / 255.0), + Normalize(self.video_means, self.video_stds), + ShortSideScale(self.video_min_short_side_scale), + CenterCrop(self.video_crop_size) ] - ), + ) ) - def _make_ds_and_loader(self, mode: str): - """Creates both the dataset and dataloader for a given dataset split 'mode'. This returns - both the dataset and the dataloader specified, and should be called from self.{train|val|test}_dataloader(). + def prepare_data(self): + """Download the dataset if it doesn't already exist. This runs only on rank 0""" + if not (self.SOURCE_URL is None or self.SOURCE_DIR_NAME is None): + if not self.data_path.exists(): + download_and_unzip(self.SOURCE_URL, self.root, verify=self.VERIFY_SSL) - Args: - mode (str): The dataset split to create. Should be 'train' or 'val'. 
- """ - ds = LimitDataset( + def train_dataloader(self): + self.train_dataset = LimitDataset( labeled_video_dataset( - data_path=str( - Path(self.root) - / (self.TRAIN_PATH if mode == "train" else self.VAL_PATH) - ), - clip_sampler=make_clip_sampler( - "random" if mode == "train" else "uniform", self.args.clip_duration - ), - video_path_prefix=self.args.video_path_prefix, - transform=self._make_transforms(mode=mode), + data_path=str(Path(self.data_path) / 'train'), + clip_sampler=make_clip_sampler("random", self.clip_duration), + transform=self.train_transform, + decode_audio=False, video_sampler=DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) - else RandomSampler, + else RandomSampler ) ) - return ds, DataLoader( - ds, batch_size=self.args.batch_size, num_workers=self.args.workers + return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.workers) + + def val_dataloader(self): + self.val_dataset = LimitDataset( + labeled_video_dataset( + data_path=str(Path(self.data_path) / 'val'), + clip_sampler=make_clip_sampler("uniform", self.clip_duration), + transform=self.val_transform, + decode_audio=False, + video_sampler=DistributedSampler + if (self.trainer is not None and self.trainer.use_ddp) + else RandomSampler + ) ) + return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.workers) - def train_dataloader(self): - self.train_dataset, loader = self._make_ds_and_loader("train") - return loader - def val_dataloader(self): - self.val_dataset, loader = self._make_ds_and_loader("val") - return loader +class UCF11DataModule(LabeledVideoDataModule): + SOURCE_URL: str = "https://www.crcv.ucf.edu/data/YouTube_DataSet_Annotated.zip" + SOURCE_DIR_NAME: str = "action_youtube_naudio" + NUM_CLASSES: int = 11 + VERIFY_SSL: bool = False -class LimitDataset(torch.utils.data.Dataset): - """ - To ensure a constant number of samples are retrieved from the dataset we use this - LimitDataset wrapper. This is necessary because several of the underlying videos - may be corrupted while fetching or decoding, however, we always want the same - number of steps per epoch. - """ + def __init__(self, **kwargs): + """ + The UCF11 Dataset contains 11 action classes: basketball shooting, biking/cycling, diving, + golf swinging, horse back riding, soccer juggling, swinging, tennis swinging, trampoline jumping, + volleyball spiking, and walking with a dog. + + For each class, the videos are grouped into 25 group/scene folders containing at least 4 video clips each. + The video clips in the same scene folder share some common features, such as the same actor, similar + background, similar viewpoint, and so on. + + The folder structure looks like the following: + + /data_dir + ├── basketball # Class Folder Path + │ ├── v_shooting_01 # Scene/Group Folder Path + │ │ ├── v_shooting_01_01.avi # Video Path + │ │ ├── v_shooting_01_02.avi + │ │ ├── v_shooting_01_03.avi + │ │ ├── ... + │ ├── v_shooting_02 + │ ├── v_shooting_03 + │ ├── ... + │ ... + ├── biking + │ ├── v_biking_01 + │ │ ├── v_biking_01_01.avi + │ │ ├── v_biking_01_02.avi + │ │ ├── v_biking_01_03.avi + │ ├── v_biking_02 + │ ├── v_biking_03 + │ ... + ... + + We take 80% of all scenes and use the videos within for training. The remaining scenes' videos + are used for validation. We do this so the validation data contains only videos from scenes/actors + that the model has not seen yet. 
+ """ + super().__init__(**kwargs) - def __init__(self, dataset): - super().__init__() - self.dataset = dataset - self.dataset_iter = itertools.chain.from_iterable( - itertools.repeat(iter(dataset), 2) - ) + def setup(self, stage=None): + """Set up anything needed for initializing train/val datasets. This runs on all nodes""" - def __getitem__(self, index): - return next(self.dataset_iter) + # Names of classes to predict + # Ex. ['basketball', 'biking', 'diving', ...] + self.classes = sorted(x.name for x in self.data_path.glob("*") if x.is_dir()) - def __len__(self): - return self.dataset.num_videos + # Mapping from label to class id. + # Ex. {'basketball': 0, 'biking': 1, 'diving': 2, ...} + self.label_to_id = {} + # A list to hold all available scenes across all classes + scene_folders = [] -class KineticsDataModule(LabeledVideoDataModule): - TRAIN_PATH = "train.csv" - VAL_PATH = "val.csv" - NUM_CLASSES = 700 + for class_id, class_name in enumerate(self.classes): + self.label_to_id[class_name] = class_id -class MiniKineticsDataModule(LabeledVideoDataModule): + # The path of a class folder within self.data_path + # Ex. 'action_youtube_naudio/{basketball|biking|diving|...}' + class_folder = self.data_path / class_name - TRAIN_PATH = "train" - VAL_PATH = "val" - SOURCE_URL = "https://pl-flash-data.s3.amazonaws.com/kinetics.zip" - SOURCE_DIR_NAME = "kinetics" - NUM_CLASSES = 6 + # Collect scene folders within this class + # Ex. 'action_youtube_naudio/basketball/v_shooting_01' + for scene_folder in filter(Path.is_dir, class_folder.glob('v_*')): + scene_folders.append(scene_folder) + # Randomly shuffle the scene folders before splitting them into train/val + shuffle(scene_folders) -class UCF11DataModule(LabeledVideoDataModule): - TRAIN_PATH = None - VAL_PATH = None - SOURCE_URL = "https://www.crcv.ucf.edu/data/YouTube_DataSet_Annotated.zip" - SOURCE_DIR_NAME = "action_youtube_naudio" - NUM_CLASSES = 11 - - def __init__(self, args): - args.verify = False - super().__init__(args) - - data_path = Path(self.args.data_path) - root = data_path / self.SOURCE_DIR_NAME - self.classes = [x.name for x in root.glob("*") if x.is_dir()] - self.id_to_label = dict(zip(range(len(self.classes)), self.classes)) - self.class_to_label = {v: k for k, v in self.id_to_label.items()} - self.num_classes = len(self.classes) + # Determine number of scenes in train/validation splits. + self.num_train_scenes = int(0.8 * len(scene_folders)) + self.num_val_scenes = len(scene_folders) - self.num_train_scenes + # Collect train/val paths to videos within each scene folder. + # Validation only uses videos from scenes not seen by model during training self.train_paths = [] self.val_paths = [] - self.holdout_scenes = {} - for c in self.classes: - - # Scenes within each class directory - scene_names = list( - x.name - for x in (root / c).glob("*") - if x.is_dir() and x.name != "Annotation" - ) - shuffle(scene_names) + for i, scene_path in enumerate(scene_folders): + + # The actual name of the class (Ex. 'basketball') + class_name = scene_path.parent.name - # Holdout a random actor/scene - holdout_scene = scene_names[-1] - scene_names = scene_names[:-1] + # Loop over all the videos within the given scene folder. + for video_path in scene_path.glob("*.avi"): - # Keep track of which scenes we held out for each class w/ a dict - self.holdout_scenes[c] = holdout_scene + # Construct a tuple containing (, ) + # In our case, we assign the class's ID as 'label'. 
+ labeled_path = (video_path, {"label": self.label_to_id[class_name]}) - for v in (root / c).glob("**/*.avi"): - labeled_path = (v, {"label": self.class_to_label[c]}) - if v.parent.name != holdout_scene: + if i < self.num_train_scenes: self.train_paths.append(labeled_path) else: self.val_paths.append(labeled_path) - def _make_ds_and_loader(self, mode: str): - ds = LimitDataset( + def train_dataloader(self): + self.train_dataset = LimitDataset( LabeledVideoDataset( - self.train_paths if mode == "train" else self.val_paths, - clip_sampler=make_clip_sampler( - "random" if mode == "train" else "uniform", self.args.clip_duration - ), + self.train_paths, + clip_sampler=make_clip_sampler('random', self.clip_duration), decode_audio=False, - transform=self._make_transforms(mode=mode), - video_sampler=DistributedSampler - if (self.trainer is not None and self.trainer.use_ddp) - else RandomSampler, + transform=self.train_transform, + video_sampler=RandomSampler ) ) - return ds, DataLoader( - ds, batch_size=self.args.batch_size, num_workers=self.args.workers + return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.workers) + + def val_dataloader(self): + self.val_dataset = LimitDataset( + LabeledVideoDataset( + self.val_paths, + clip_sampler=make_clip_sampler('uniform', self.clip_duration), + decode_audio=False, + transform=self.val_transform, + video_sampler=RandomSampler + ) ) + return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.workers) def download_and_unzip(url, data_dir="./", verify=True): @@ -284,3 +277,26 @@ def download_and_unzip(url, data_dir="./", verify=True): f.write(resp.content) unpack_archive(data_zip_path, extract_dir=data_dir) + + +class LimitDataset(torch.utils.data.Dataset): + + """ + To ensure a constant number of samples are retrieved from the dataset we use this + LimitDataset wrapper. This is necessary because several of the underlying videos + may be corrupted while fetching or decoding, however, we always want the same + number of steps per epoch. + """ + + def __init__(self, dataset): + super().__init__() + self.dataset = dataset + self.dataset_iter = itertools.chain.from_iterable( + itertools.repeat(iter(dataset), 2) + ) + + def __getitem__(self, index): + return next(self.dataset_iter) + + def __len__(self): + return self.dataset.num_videos diff --git a/tutorials/video_classification_example/finetune.py b/tutorials/video_classification_example/finetune.py index 2aca590d..db097110 100644 --- a/tutorials/video_classification_example/finetune.py +++ b/tutorials/video_classification_example/finetune.py @@ -1,193 +1,39 @@ -from argparse import ArgumentParser - import pytorch_lightning as pl -import torch -from data import KineticsDataModule, MiniKineticsDataModule, UCF11DataModule -from pytorchvideo.models.head import create_res_basic_head -from torch import nn -from torch.optim import Adam - - -DATASET_MAP = { - "ucf11": UCF11DataModule, - "kinetics": KineticsDataModule, - "kinetics-mini": MiniKineticsDataModule, -} - -class Classifier(pl.LightningModule): +from data import UCF11DataModule +from models import SlowResnet50LightningModel +from train import parse_args - def __init__( - self, - num_classes: int = 11, - lr: float = 2e-4, - freeze_backbone: bool = True, - pretrained: bool = True, - **kwargs - ): - """A classifier for finetuning pretrained video classification backbones from - torchhub. We use the slow_r50 model here, but you can edit this class to - use whatever backbone/head you'd like. 
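The (video_path, label_dict) pairs built above feed straight into LabeledVideoDataset. A stripped-down sketch of that pattern follows; the .avi paths are placeholders and must exist on disk before the loader is iterated.

    from pytorchvideo.data import LabeledVideoDataset, make_clip_sampler
    from torch.utils.data import DataLoader, RandomSampler

    labeled_paths = [
        ("action_youtube_naudio/basketball/v_shooting_01/v_shooting_01_01.avi", {"label": 0}),
        ("action_youtube_naudio/biking/v_biking_01/v_biking_01_01.avi", {"label": 1}),
    ]

    dataset = LabeledVideoDataset(
        labeled_paths,
        clip_sampler=make_clip_sampler("random", 2.0),
        video_sampler=RandomSampler,
        decode_audio=False,
        # In the module above a train/val transform is also passed; omitted here for brevity.
    )
    loader = DataLoader(dataset, batch_size=2, num_workers=0)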
- Args: - num_classes (int, optional): Number of output classes. Defaults to 11. - lr (float, optional): The learning rate for the Adam optimizer. Defaults to 2e-4. - freeze_backbone (bool, optional): Whether to freeze the backbone or leave it trainable. Defaults to True. - pretrained (bool, optional): Use the pretrained model from torchhub. When False, we initialize the slow_r50 model from scratch. Defaults to True. - - All extra kwargs will be available via self.hparams.. These will also be saved as TensorBoard Hparams. - """ - super().__init__() - self.save_hyperparameters() - - # The pretrained resnet model - we strip off its head to get the backbone - resnet = torch.hub.load( - "facebookresearch/pytorchvideo", - "slow_r50", - pretrained=self.hparams.pretrained, - ) - self.backbone = nn.Sequential(*list(resnet.children())[0][:-1]) +def train(args): + pl.seed_everything(224) + dm = UCF11DataModule(**vars(args)) + model = SlowResnet50LightningModel(num_classes=dm.NUM_CLASSES, **vars(args)) + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model, dm) - # Freeze the backbone layers if specified - if self.hparams.freeze_backbone: - for param in self.backbone.parameters(): - param.requires_grad = False - # Create a new head we will train on top of the backbone - self.head = create_res_basic_head( - in_features=2048, out_features=self.hparams.num_classes +def main(): + args = parse_args() + if args.on_cluster: + from slurm import copy_and_run_with_config + copy_and_run_with_config( + train, + args, + args.working_directory, + job_name=args.job_name, + time="72:00:00", + partition=args.partition, + gpus_per_node=args.gpus, + ntasks_per_node=args.gpus, + cpus_per_task=10, + mem="470GB", + nodes=args.num_nodes, + constraint="volta32gb", ) - - # Metrics we will keep track of - self.loss_fn = nn.CrossEntropyLoss() - self.train_acc = pl.metrics.Accuracy() - self.val_acc = pl.metrics.Accuracy() - self.accuracy = {"train": self.train_acc, "val": self.val_acc} - - def forward(self, x: torch.Tensor): - """ - Forward defines the prediction/inference actions. - """ - return self.head(self.backbone(x)) - - def shared_step(self, batch, mode: str): - """This shared step handles both the training and validation steps to avoid - re-writing the same code more than once. The given `mode` will change the name - of the logged metrics. - - Args: - batch (dict): PyTorchVideo batch dictionary containing a single batch of data. - mode (str): The type of step. Can be 'train', 'val', or 'test'. - - Returns: - torch.Tensor: The loss for a single batch step. - """ - y_hat = self(batch["video"]) - loss = self.loss_fn(y_hat, batch["label"]) - self.log(f"{mode}_loss", loss) - - if mode in ["val", "test"]: - preds = y_hat.argmax(dim=1) - acc = self.accuracy[mode](preds, batch["label"]) - self.log(f"{mode}_acc", acc, prog_bar=True) - - return loss - - def training_step(self, batch, batch_idx): - """ - This function is called in the inner loop of the training epoch. It must - return a loss that is used for loss.backwards() internally. The self.log(...) - function can be used to log any training metrics. - - PyTorchVideo batches are dictionaries containing each modality or metadata of - the batch collated video clips. 
Kinetics contains the following notable keys: - { - 'video': , - 'audio': , - 'label': , - } - - - "video" is a Tensor of shape (batch, channels, time, height, Width) - - "audio" is a Tensor of shape (batch, channels, time, 1, frequency) - - "label" is a Tensor of shape (batch, 1) - - The PyTorchVideo models and transforms expect the same input shapes and - dictionary structure making this function just a matter of unwrapping the dict and - feeding it through the model/loss. - """ - return self.shared_step(batch, "train") - - def validation_step(self, batch, batch_idx): - """ - This function is called in the inner loop of the evaluation cycle. For this - simple example it's mostly the same as the training loop but with a different - metric name. - """ - return self.shared_step(batch, "val") - - def test_step(self, batch, batch_idx): - return self.shared_step(batch, "test") - - def configure_optimizers(self): - return Adam(self.parameters(), lr=self.hparams.lr) - - -def parse_args(args=None): - parser = ArgumentParser() - - # Model parameters. - parser.add_argument("--lr", "--learning-rate", default=0.1, type=float) - parser.add_argument("--momentum", default=0.9, type=float) - parser.add_argument("--weight_decay", default=1e-4, type=float) - parser.add_argument( - "--arch", - default="video_resnet", - choices=["video_resnet", "audio_resnet"], - type=str, - ) - - # Data parameters - parser.add_argument( - "--dataset", default="ucf11", choices=["ucf11", "kinetics", "kinetics-mini"] - ) - parser.add_argument("--data_path", default=None, type=str, required=True) - parser.add_argument("--video_path_prefix", default="", type=str) - parser.add_argument("--workers", default=8, type=int) - parser.add_argument("--batch_size", default=32, type=int) - parser.add_argument("--clip_duration", default=2, type=float) - parser.add_argument( - "--data_type", default="video", choices=["video", "audio"], type=str - ) - parser.add_argument("--video_num_subsampled", default=8, type=int) - parser.add_argument("--video_means", default=(0.45, 0.45, 0.45), type=tuple) - parser.add_argument("--video_stds", default=(0.225, 0.225, 0.225), type=tuple) - parser.add_argument("--video_crop_size", default=224, type=int) - parser.add_argument("--video_min_short_side_scale", default=256, type=int) - parser.add_argument("--video_max_short_side_scale", default=320, type=int) - parser.add_argument("--video_horizontal_flip_p", default=0.5, type=float) - parser.add_argument("--audio_raw_sample_rate", default=44100, type=int) - parser.add_argument("--audio_resampled_rate", default=16000, type=int) - parser.add_argument("--audio_mel_window_size", default=32, type=int) - parser.add_argument("--audio_mel_step_size", default=16, type=int) - parser.add_argument("--audio_num_mels", default=80, type=int) - parser.add_argument("--audio_mel_num_subsample", default=128, type=int) - parser.add_argument("--audio_logmel_mean", default=-7.03, type=float) - parser.add_argument("--audio_logmel_std", default=4.66, type=float) - - # Add PyTorch Lightning's Trainer init arguments as parser flags - parser = pl.Trainer.add_argparse_args(parser) - - return parser.parse_args(args=args) - - -def main(args): - pl.trainer.seed_everything() - dm_cls = DATASET_MAP.get(args.dataset) - dm = dm_cls(args) - model = Classifier(num_classes=dm_cls.NUM_CLASSES, **vars(args)) - trainer = pl.Trainer.from_argparse_args(args) - trainer.fit(model, dm) + else: # local + train(args) -if __name__ == "__main__": - main(parse_args()) +if __name__ == '__main__': + main() diff 
--git a/tutorials/video_classification_example/models.py b/tutorials/video_classification_example/models.py new file mode 100644 index 00000000..9571e8d8 --- /dev/null +++ b/tutorials/video_classification_example/models.py @@ -0,0 +1,150 @@ +import pytorch_lightning as pl +import torch +from torch import nn +from pytorchvideo.models.resnet import create_resnet +from pytorchvideo.models.head import create_res_basic_head + + +class VideoClassificationLightningModule(pl.LightningModule): + + def __init__( + self, + num_classes: int = 11, + lr: float = 2e-4, + **kwargs + ): + """A classifier for finetuning pretrained video classification backbones from + torchhub. We use the slow_r50 model here, but you can edit this class to + use whatever backbone/head you'd like. + + Args: + num_classes (int, optional): Number of output classes. Defaults to 11. + lr (float, optional): The learning rate for the Adam optimizer. Defaults to 2e-4. + freeze_backbone (bool, optional): Whether to freeze the backbone or leave it trainable. Defaults to True. + pretrained (bool, optional): Use the pretrained model from torchhub. When False, we initialize the + slow_r50 model from scratch. Defaults to True. + + All extra kwargs will be available via self.hparams.. These will also be saved as + TensorBoard Hparams. + """ + super().__init__() + + # Saves all kwargs to self.hparams. Use references to self.hparams., not the init args themselves. + self.save_hyperparameters() + + # Build the model in separate function so its easier to override + self.model = self._build_model() + + # Metrics we will keep track of + self.loss_fn = nn.CrossEntropyLoss() + self.train_acc = pl.metrics.Accuracy() + self.val_acc = pl.metrics.Accuracy() + self.accuracy = {"train": self.train_acc, "val": self.val_acc} + + def _build_model(self): + return create_resnet(model_num_class=self.hparams.num_classes) + + def on_train_epoch_start(self): + """ + For distributed training we need to set the datasets video sampler epoch so + that shuffling is done correctly + """ + epoch = self.trainer.current_epoch + if self.trainer.use_ddp: + self.trainer.datamodule.train_dataset.dataset.video_sampler.set_epoch(epoch) + + def forward(self, x: torch.Tensor): + """ + Forward defines the prediction/inference actions. + """ + return self.model(x) + + def shared_step(self, batch, mode: str): + """This shared step handles both the training and validation steps to avoid + re-writing the same code more than once. The given `mode` will change the name + of the logged metrics. + + PyTorchVideo batches are dictionaries containing each modality or metadata of + the batch collated video clips. Kinetics contains the following notable keys: + { + 'video': , + 'label': , + } + + - "video" is a Tensor of shape (batch, channels, time, height, Width) + - "label" is a Tensor of shape (batch, 1) + + The PyTorchVideo models and transforms expect the same input shapes and + dictionary structure making this function just a matter of unwrapping the dict and + feeding it through the model/loss. + + Args: + batch (dict): PyTorchVideo batch dictionary containing a single batch of data. + mode (str): The type of step. Can be 'train', 'val', or 'test'. + + Returns: + torch.Tensor: The loss for a single batch step. 
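The batch layout referenced in the docstring above can be exercised with purely synthetic data. A short sketch using the same create_resnet backbone as the default _build_model(); the batch size, clip length, and class count here are arbitrary:

    import torch
    import torch.nn.functional as F
    from pytorchvideo.models.resnet import create_resnet

    model = create_resnet(model_num_class=11)

    batch = {
        "video": torch.randn(2, 3, 8, 224, 224),  # (batch, channels, time, height, width)
        "label": torch.tensor([0, 3]),            # (batch,)
    }

    outputs = model(batch["video"])               # (2, 11) class logits
    loss = F.cross_entropy(outputs, batch["label"])
    preds = outputs.softmax(dim=1).argmax(dim=1)  # (2,) predicted class ids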
+ """ + + # Pass video tensor through model to get outputs + outputs = self(batch["video"]) + + # Compute and log the cross entropy loss to {train|val}_loss in TensorBoard + loss = self.loss_fn(outputs, batch["label"]) + self.log(f"{mode}_loss", loss) + + # Predicted class probabilities - (BATCH_SIZE, NUM_CLASSES) + proba = outputs.softmax(dim=1) + + # Predicted classes - (BATCH_SIZE,) + preds = proba.argmax(dim=1) + + # Compute the predicted class accuracy and log it to {train|val}_acc in TensorBoard + acc = self.accuracy[mode](preds, batch["label"]) + self.log(f"{mode}_acc", acc, prog_bar=True) + + return loss + + def training_step(self, batch, batch_idx): + """ + This function is called in the inner loop of the training epoch. It must + return a loss that is used for loss.backwards() internally. + """ + return self.shared_step(batch, "train") + + def validation_step(self, batch, batch_idx): + """ + This function is called in the inner loop of the evaluation cycle. For this + simple example it's mostly the same as the training loop but with a different + metric name. + """ + return self.shared_step(batch, "val") + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=self.hparams.lr) + + +class SlowResnet50LightningModel(VideoClassificationLightningModule): + + def __init__(self, freeze_backbone: bool = True, pretrained: bool = True, **kwargs): + super().__init__(freeze_backbone=freeze_backbone, pretrained=pretrained, **kwargs) + + def _build_model(self): + # The pretrained resnet model - we strip off its head to get the backbone + resnet = torch.hub.load( + "facebookresearch/pytorchvideo", + "slow_r50", + pretrained=self.hparams.pretrained, + ) + self.backbone = nn.Sequential(*list(resnet.children())[0][:-1]) + + # Freeze the backbone layers if specified + if self.hparams.freeze_backbone: + for param in self.backbone.parameters(): + param.requires_grad = False + + # Create a new head we will train on top of the backbone + self.head = create_res_basic_head( + in_features=2048, out_features=self.hparams.num_classes + ) + return nn.Sequential(self.backbone, self.head) diff --git a/tutorials/video_classification_example/train.py b/tutorials/video_classification_example/train.py index 8588d129..93ec1eb2 100644 --- a/tutorials/video_classification_example/train.py +++ b/tutorials/video_classification_example/train.py @@ -1,382 +1,12 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from argparse import ArgumentParser -import argparse -import itertools -import logging -import os +import pytorch_lightning as pl +from data import LabeledVideoDataModule +from models import VideoClassificationLightningModule -import pytorch_lightning -import pytorchvideo.data -import pytorchvideo.models.resnet -import torch -import torch.nn.functional as F -from pytorch_lightning.callbacks import LearningRateMonitor -from pytorchvideo.transforms import ( - ApplyTransformToKey, - Normalize, - RandomShortSideScale, - RemoveKey, - ShortSideScale, - UniformTemporalSubsample, -) -from torch.utils.data import DistributedSampler, RandomSampler -from torchaudio.transforms import MelSpectrogram, Resample -from torchvision.transforms import ( - CenterCrop, - Compose, - Lambda, - RandomCrop, - RandomHorizontalFlip, -) - -""" -This video classification example demonstrates how PyTorchVideo models, datasets and -transforms can be used with PyTorch Lightning module. Specifically it shows how a -simple pipeline to train a Resnet on the Kinetics video dataset can be built. 
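One quick way to sanity-check the freeze_backbone behaviour above is to count which parameters remain trainable after the backbone/head surgery. A rough sketch, assuming torch.hub can fetch the slow_r50 definition (network access required):

    import torch
    from torch import nn
    from pytorchvideo.models.head import create_res_basic_head

    resnet = torch.hub.load("facebookresearch/pytorchvideo", "slow_r50", pretrained=False)
    backbone = nn.Sequential(*list(resnet.children())[0][:-1])
    for param in backbone.parameters():
        param.requires_grad = False

    head = create_res_basic_head(in_features=2048, out_features=11)
    model = nn.Sequential(backbone, head)

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable parameters: {trainable} / {total}")  # only the new head is trainable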
- -Don't worry if you don't have PyTorch Lightning experience. We'll provide an explanation -of how the PyTorch Lightning module works to accompany the example. - -The code can be separated into three main components: -1. VideoClassificationLightningModule (pytorch_lightning.LightningModule), this defines: - - how the model is constructed, - - the inner train or validation loop (i.e. computing loss/metrics from a minibatch) - - optimizer configuration - -2. KineticsDataModule (pytorch_lightning.LightningDataModule), this defines: - - how to fetch/prepare the dataset - - the train and val dataloaders for the associated dataset - -3. pytorch_lightning.Trainer, this is a concrete PyTorch Lightning class that provides - the training pipeline configuration and a fit(, ) - function to start the training/validation loop. - -All three components are combined in the train() function. We'll explain the rest of the -details inline. -""" - - -class VideoClassificationLightningModule(pytorch_lightning.LightningModule): - def __init__(self, args): - """ - This LightningModule implementation constructs a PyTorchVideo ResNet, - defines the train and val loss to be trained with (cross_entropy), and - configures the optimizer. - """ - self.args = args - super().__init__() - self.train_accuracy = pytorch_lightning.metrics.Accuracy() - self.val_accuracy = pytorch_lightning.metrics.Accuracy() - - ############# - # PTV Model # - ############# - - # Here we construct the PyTorchVideo model. For this example we're using a - # ResNet that works with Kinetics (e.g. 400 num_classes). For your application, - # this could be changed to any other PyTorchVideo model (e.g. for SlowFast use - # create_slowfast). - if self.args.arch == "video_resnet": - self.model = pytorchvideo.models.resnet.create_resnet( - input_channel=3, - model_num_class=11 # 400, - ) - self.batch_key = "video" - elif self.args.arch == "audio_resnet": - self.model = pytorchvideo.models.resnet.create_acoustic_resnet( - input_channel=1, - model_num_class=400, - ) - self.batch_key = "audio" - else: - raise Exception("{self.args.arch} not supported") - - def on_train_epoch_start(self): - """ - For distributed training we need to set the datasets video sampler epoch so - that shuffling is done correctly - """ - epoch = self.trainer.current_epoch - if self.trainer.use_ddp: - self.trainer.datamodule.train_dataset.dataset.video_sampler.set_epoch(epoch) - - def forward(self, x): - """ - Forward defines the prediction/inference actions. - """ - return self.model(x) - - def training_step(self, batch, batch_idx): - """ - This function is called in the inner loop of the training epoch. It must - return a loss that is used for loss.backwards() internally. The self.log(...) - function can be used to log any training metrics. - - PyTorchVideo batches are dictionaries containing each modality or metadata of - the batch collated video clips. Kinetics contains the following notable keys: - { - 'video': , - 'audio': , - 'label': , - } - - - "video" is a Tensor of shape (batch, channels, time, height, Width) - - "audio" is a Tensor of shape (batch, channels, time, 1, frequency) - - "label" is a Tensor of shape (batch, 1) - - The PyTorchVideo models and transforms expect the same input shapes and - dictionary structure making this function just a matter of unwrapping the dict and - feeding it through the model/loss. 
- """ - x = batch[self.batch_key] - y_hat = self.model(x) - loss = F.cross_entropy(y_hat, batch["label"]) - acc = self.train_accuracy(F.softmax(y_hat, dim=-1), batch["label"]) - self.log("train_loss", loss) - self.log( - "train_acc", acc, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True - ) - return loss - - def validation_step(self, batch, batch_idx): - """ - This function is called in the inner loop of the evaluation cycle. For this - simple example it's mostly the same as the training loop but with a different - metric name. - """ - x = batch[self.batch_key] - y_hat = self.model(x) - loss = F.cross_entropy(y_hat, batch["label"]) - acc = self.val_accuracy(F.softmax(y_hat, dim=-1), batch["label"]) - self.log("val_loss", loss) - self.log( - "val_acc", acc, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True - ) - return loss - - def configure_optimizers(self): - """ - We use the SGD optimizer with per step cosine annealing scheduler. - """ - optimizer = torch.optim.SGD( - self.parameters(), - lr=self.args.lr, - momentum=self.args.momentum, - weight_decay=self.args.weight_decay, - ) - scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer, self.args.max_epochs, last_epoch=-1 - ) - return [optimizer], [scheduler] - - -class KineticsDataModule(pytorch_lightning.LightningDataModule): - """ - This LightningDataModule implementation constructs a PyTorchVideo Kinetics dataset for both - the train and val partitions. It defines each partition's augmentation and - preprocessing transforms and configures the PyTorch DataLoaders. - """ - - TRAIN_PATH = 'train.csv' - VAL_PATH = 'val.csv' - - def __init__(self, args): - self.args = args - super().__init__() - - def _make_transforms(self, mode: str): - """ - ################## - # PTV Transforms # - ################## - - # Each PyTorchVideo dataset has a "transform" arg. This arg takes a - # Callable[[Dict], Any], and is used on the output Dict of the dataset to - # define any application specific processing or augmentation. Transforms can - # either be implemented by the user application or reused from any library - # that's domain specific to the modality. E.g. for video we recommend using - # TorchVision, for audio we recommend TorchAudio. - # - # To improve interoperation between domain transform libraries, PyTorchVideo - # provides a dictionary transform API that provides: - # - ApplyTransformToKey(key, transform) - applies a transform to specific modality - # - RemoveKey(key) - remove a specific modality from the clip - # - # In the case that the recommended libraries don't provide transforms that - # are common enough for PyTorchVideo use cases, PyTorchVideo will provide them in - # the same structure as the recommended library. E.g. TorchVision didn't - # have a RandomShortSideScale video transform so it's been added to PyTorchVideo. - """ - if self.args.data_type == "video": - transform = [ - self._video_transform(mode), - RemoveKey("audio"), - ] - elif self.args.data_type == "audio": - transform = [ - self._audio_transform(), - RemoveKey("video"), - ] - else: - raise Exception(f"{self.args.data_type} not supported") - - return Compose(transform) - - def _video_transform(self, mode: str): - """ - This function contains example transforms using both PyTorchVideo and TorchVision - in the same Callable. For 'train' mode, we use augmentations (prepended with - 'Random'), for 'val' mode we use the respective determinstic function. 
- """ - args = self.args - return ApplyTransformToKey( - key="video", - transform=Compose( - [ - UniformTemporalSubsample(args.video_num_subsampled), - Lambda(lambda x: x/255.0), - Normalize(args.video_means, args.video_stds), - ] - + ( - [ - RandomShortSideScale( - min_size=args.video_min_short_side_scale, - max_size=args.video_max_short_side_scale, - ), - RandomCrop(args.video_crop_size), - RandomHorizontalFlip(p=args.video_horizontal_flip_p), - ] - if mode == "train" - else [ - ShortSideScale(args.video_min_short_side_scale), - CenterCrop(args.video_crop_size), - ] - ) - ), - ) - - def _audio_transform(self): - """ - This function contains example transforms using both PyTorchVideo and TorchAudio - in the same Callable. - """ - args = self.args - n_fft = int( - float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size - ) - hop_length = int( - float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size - ) - eps = 1e-10 - return ApplyTransformToKey( - key="audio", - transform=Compose( - [ - Resample( - orig_freq=args.audio_raw_sample_rate, - new_freq=args.audio_resampled_rate, - ), - MelSpectrogram( - sample_rate=args.audio_resampled_rate, - n_fft=n_fft, - hop_length=hop_length, - n_mels=args.audio_num_mels, - center=False, - ), - Lambda(lambda x: x.clamp(min=eps)), - Lambda(torch.log), - UniformTemporalSubsample(args.audio_mel_num_subsample), - Lambda(lambda x: x.transpose(1, 0)), # (F, T) -> (T, F) - Lambda( - lambda x: x.view(1, x.size(0), 1, x.size(1)) - ), # (T, F) -> (1, T, 1, F) - Normalize((args.audio_logmel_mean,), (args.audio_logmel_std,)), - ] - ), - ) - - def train_dataloader(self): - """ - Defines the train DataLoader that the PyTorch Lightning Trainer trains/tests with. - """ - sampler = DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler - train_transform = self._make_transforms(mode="train") - self.train_dataset = LimitDataset( - pytorchvideo.data.Kinetics( - data_path=os.path.join(self.args.data_path, self.TRAIN_PATH), - clip_sampler=pytorchvideo.data.make_clip_sampler( - "random", self.args.clip_duration - ), - video_path_prefix=self.args.video_path_prefix, - transform=train_transform, - video_sampler=sampler, - ) - ) - return torch.utils.data.DataLoader( - self.train_dataset, - batch_size=self.args.batch_size, - num_workers=self.args.workers, - ) - - def val_dataloader(self): - """ - Defines the train DataLoader that the PyTorch Lightning Trainer trains/tests with. - """ - sampler = DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler - val_transform = self._make_transforms(mode="val") - self.val_dataset = LimitDataset( - pytorchvideo.data.Kinetics( - data_path=os.path.join(self.args.data_path, self.VAL_PATH), - clip_sampler=pytorchvideo.data.make_clip_sampler( - "uniform", self.args.clip_duration - ), - video_path_prefix=self.args.video_path_prefix, - transform=val_transform, - video_sampler=sampler, - ) - ) - return torch.utils.data.DataLoader( - self.val_dataset, - batch_size=self.args.batch_size, - num_workers=self.args.workers, - ) - - -class LimitDataset(torch.utils.data.Dataset): - """ - To ensure a constant number of samples are retrieved from the dataset we use this - LimitDataset wrapper. This is necessary because several of the underlying videos - may be corrupted while fetching or decoding, however, we always want the same - number of steps per epoch. 
- """ - - def __init__(self, dataset): - super().__init__() - self.dataset = dataset - self.dataset_iter = itertools.chain.from_iterable( - itertools.repeat(iter(dataset), 2) - ) - - def __getitem__(self, index): - return next(self.dataset_iter) - - def __len__(self): - return self.dataset.num_videos - - -def main(): - """ - To train the ResNet with the Kinetics dataset we construct the two modules above, - and pass them to the fit function of a pytorch_lightning.Trainer. - - This example can be run either locally (with default parameters) or on a Slurm - cluster. To run on a Slurm cluster provide the --on_cluster argument. - """ - setup_logger() - - pytorch_lightning.trainer.seed_everything() - parser = argparse.ArgumentParser() +def parse_args(args=None): + parser = ArgumentParser() # Cluster parameters. parser.add_argument("--on_cluster", action="store_true") @@ -384,54 +14,32 @@ def main(): parser.add_argument("--working_directory", default=".", type=str) parser.add_argument("--partition", default="dev", type=str) - # Model parameters. - parser.add_argument("--lr", "--learning-rate", default=0.1, type=float) - parser.add_argument("--momentum", default=0.9, type=float) - parser.add_argument("--weight_decay", default=1e-4, type=float) - parser.add_argument( - "--arch", - default="video_resnet", - choices=["video_resnet", "audio_resnet"], - type=str, - ) + # Model Parameters + parser.add_argument('--lr', '--learning_rate', default=2e-4, type=float) - # Data parameters. - parser.add_argument("--data_path", default=None, type=str, required=True) - parser.add_argument("--video_path_prefix", default="", type=str) - parser.add_argument("--workers", default=8, type=int) - parser.add_argument("--batch_size", default=32, type=int) - parser.add_argument("--clip_duration", default=2, type=float) - parser.add_argument( - "--data_type", default="video", choices=["video", "audio"], type=str - ) - parser.add_argument("--video_num_subsampled", default=8, type=int) - parser.add_argument("--video_means", default=(0.45, 0.45, 0.45), type=tuple) - parser.add_argument("--video_stds", default=(0.225, 0.225, 0.225), type=tuple) - parser.add_argument("--video_crop_size", default=224, type=int) - parser.add_argument("--video_min_short_side_scale", default=256, type=int) - parser.add_argument("--video_max_short_side_scale", default=320, type=int) - parser.add_argument("--video_horizontal_flip_p", default=0.5, type=float) - parser.add_argument("--audio_raw_sample_rate", default=44100, type=int) - parser.add_argument("--audio_resampled_rate", default=16000, type=int) - parser.add_argument("--audio_mel_window_size", default=32, type=int) - parser.add_argument("--audio_mel_step_size", default=16, type=int) - parser.add_argument("--audio_num_mels", default=80, type=int) - parser.add_argument("--audio_mel_num_subsample", default=128, type=int) - parser.add_argument("--audio_logmel_mean", default=-7.03, type=float) - parser.add_argument("--audio_logmel_std", default=4.66, type=float) + # Data Parameters + parser = LabeledVideoDataModule.add_argparse_args(parser) - # Trainer parameters. - parser = pytorch_lightning.Trainer.add_argparse_args(parser) + # Training Parameters + parser = pl.Trainer.add_argparse_args(parser) parser.set_defaults( - max_epochs=200, - callbacks=[LearningRateMonitor()], + callbacks=[pl.callbacks.LearningRateMonitor()], replace_sampler_ddp=False, - reload_dataloaders_every_epoch=False, ) - # Build trainer, ResNet lightning-module and Kinetics data-module. 
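The refactored parse_args() can also be driven without a shell by passing a list of flags. This sketch assumes the data-module flags (--root, --batch_size, --workers) are generated by LabeledVideoDataModule.add_argparse_args from its __init__ signature, so the exact names may differ:

    from train import parse_args

    args = parse_args(
        [
            "--lr", "2e-4",
            "--root", "./data",
            "--batch_size", "8",
            "--workers", "2",
            "--max_epochs", "1",
        ]
    )
    print(args.lr, args.batch_size, args.max_epochs)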
- args = parser.parse_args() + return parser.parse_args(args) + + +def train(args): + pl.seed_everything(224) + dm = LabeledVideoDataModule.from_argparse_args(args) + model = VideoClassificationLightningModule(num_classes=dm.NUM_CLASSES, **vars(args)) + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model, dm) + +def main(): + args = parse_args() if args.on_cluster: from slurm import copy_and_run_with_config copy_and_run_with_config( @@ -452,21 +60,5 @@ def main(): train(args) -def train(args): - trainer = pytorch_lightning.Trainer.from_argparse_args(args) - classification_module = VideoClassificationLightningModule(args) - data_module = KineticsDataModule(args) - trainer.fit(classification_module, data_module) - - -def setup_logger(): - ch = logging.StreamHandler() - formatter = logging.Formatter("\n%(asctime)s [%(levelname)s] %(name)s: %(message)s") - ch.setFormatter(formatter) - logger = logging.getLogger("pytorchvideo") - logger.setLevel(logging.DEBUG) - logger.addHandler(ch) - - -if __name__ == "__main__": +if __name__ == '__main__': main() From 7df5f3e321d8bb8b5694fa1e7048147be08811eb Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 11:42:40 -0600 Subject: [PATCH 09/15] :lipstick: apply style --- .../video_classification_example/data.py | 47 +++++++++++-------- .../video_classification_example/finetune.py | 4 +- .../video_classification_example/models.py | 17 +++---- .../video_classification_example/train.py | 5 +- 4 files changed, 38 insertions(+), 35 deletions(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index b1c29268..b2156f97 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -16,7 +16,6 @@ ShortSideScale, UniformTemporalSubsample, ) - from torch.utils.data import DataLoader, DistributedSampler, RandomSampler from torchvision.transforms import ( CenterCrop, @@ -36,7 +35,7 @@ class LabeledVideoDataModule(pl.LightningDataModule): def __init__( self, - root: str = './', + root: str = "./", clip_duration: int = 2, video_num_subsampled: int = 8, video_crop_size: int = 224, @@ -66,7 +65,7 @@ def __init__( # Transforms applied to train dataset self.train_transform = ApplyTransformToKey( - key='video', + key="video", transform=Compose( [ UniformTemporalSubsample(self.video_num_subsampled), @@ -79,21 +78,21 @@ def __init__( RandomCrop(self.video_crop_size), RandomHorizontalFlip(p=self.video_horizontal_flip_p), ] - ) + ), ) # Transforms applied on val dataset or for inference self.val_transform = ApplyTransformToKey( - key='video', + key="video", transform=Compose( [ UniformTemporalSubsample(self.video_num_subsampled), Lambda(lambda x: x / 255.0), Normalize(self.video_means, self.video_stds), ShortSideScale(self.video_min_short_side_scale), - CenterCrop(self.video_crop_size) + CenterCrop(self.video_crop_size), ] - ) + ), ) def prepare_data(self): @@ -105,30 +104,34 @@ def prepare_data(self): def train_dataloader(self): self.train_dataset = LimitDataset( labeled_video_dataset( - data_path=str(Path(self.data_path) / 'train'), + data_path=str(Path(self.data_path) / "train"), clip_sampler=make_clip_sampler("random", self.clip_duration), transform=self.train_transform, decode_audio=False, video_sampler=DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) - else RandomSampler + else RandomSampler, ) ) - return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.workers) + return DataLoader( + 
self.train_dataset, batch_size=self.batch_size, num_workers=self.workers + ) def val_dataloader(self): self.val_dataset = LimitDataset( labeled_video_dataset( - data_path=str(Path(self.data_path) / 'val'), + data_path=str(Path(self.data_path) / "val"), clip_sampler=make_clip_sampler("uniform", self.clip_duration), transform=self.val_transform, decode_audio=False, video_sampler=DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) - else RandomSampler + else RandomSampler, ) ) - return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.workers) + return DataLoader( + self.val_dataset, batch_size=self.batch_size, num_workers=self.workers + ) class UCF11DataModule(LabeledVideoDataModule): @@ -201,7 +204,7 @@ def setup(self, stage=None): # Collect scene folders within this class # Ex. 'action_youtube_naudio/basketball/v_shooting_01' - for scene_folder in filter(Path.is_dir, class_folder.glob('v_*')): + for scene_folder in filter(Path.is_dir, class_folder.glob("v_*")): scene_folders.append(scene_folder) # Randomly shuffle the scene folders before splitting them into train/val @@ -236,25 +239,29 @@ def train_dataloader(self): self.train_dataset = LimitDataset( LabeledVideoDataset( self.train_paths, - clip_sampler=make_clip_sampler('random', self.clip_duration), + clip_sampler=make_clip_sampler("random", self.clip_duration), decode_audio=False, transform=self.train_transform, - video_sampler=RandomSampler + video_sampler=RandomSampler, ) ) - return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.workers) + return DataLoader( + self.train_dataset, batch_size=self.batch_size, num_workers=self.workers + ) def val_dataloader(self): self.val_dataset = LimitDataset( LabeledVideoDataset( self.val_paths, - clip_sampler=make_clip_sampler('uniform', self.clip_duration), + clip_sampler=make_clip_sampler("uniform", self.clip_duration), decode_audio=False, transform=self.val_transform, - video_sampler=RandomSampler + video_sampler=RandomSampler, ) ) - return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.workers) + return DataLoader( + self.val_dataset, batch_size=self.batch_size, num_workers=self.workers + ) def download_and_unzip(url, data_dir="./", verify=True): diff --git a/tutorials/video_classification_example/finetune.py b/tutorials/video_classification_example/finetune.py index db097110..0dd05734 100644 --- a/tutorials/video_classification_example/finetune.py +++ b/tutorials/video_classification_example/finetune.py @@ -1,5 +1,4 @@ import pytorch_lightning as pl - from data import UCF11DataModule from models import SlowResnet50LightningModel from train import parse_args @@ -17,6 +16,7 @@ def main(): args = parse_args() if args.on_cluster: from slurm import copy_and_run_with_config + copy_and_run_with_config( train, args, @@ -35,5 +35,5 @@ def main(): train(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tutorials/video_classification_example/models.py b/tutorials/video_classification_example/models.py index 9571e8d8..600f59ce 100644 --- a/tutorials/video_classification_example/models.py +++ b/tutorials/video_classification_example/models.py @@ -1,18 +1,12 @@ import pytorch_lightning as pl import torch -from torch import nn -from pytorchvideo.models.resnet import create_resnet from pytorchvideo.models.head import create_res_basic_head +from pytorchvideo.models.resnet import create_resnet +from torch import nn class VideoClassificationLightningModule(pl.LightningModule): - - 
def __init__( - self, - num_classes: int = 11, - lr: float = 2e-4, - **kwargs - ): + def __init__(self, num_classes: int = 11, lr: float = 2e-4, **kwargs): """A classifier for finetuning pretrained video classification backbones from torchhub. We use the slow_r50 model here, but you can edit this class to use whatever backbone/head you'd like. @@ -125,9 +119,10 @@ def configure_optimizers(self): class SlowResnet50LightningModel(VideoClassificationLightningModule): - def __init__(self, freeze_backbone: bool = True, pretrained: bool = True, **kwargs): - super().__init__(freeze_backbone=freeze_backbone, pretrained=pretrained, **kwargs) + super().__init__( + freeze_backbone=freeze_backbone, pretrained=pretrained, **kwargs + ) def _build_model(self): # The pretrained resnet model - we strip off its head to get the backbone diff --git a/tutorials/video_classification_example/train.py b/tutorials/video_classification_example/train.py index 93ec1eb2..0d7b5ebb 100644 --- a/tutorials/video_classification_example/train.py +++ b/tutorials/video_classification_example/train.py @@ -15,7 +15,7 @@ def parse_args(args=None): parser.add_argument("--partition", default="dev", type=str) # Model Parameters - parser.add_argument('--lr', '--learning_rate', default=2e-4, type=float) + parser.add_argument("--lr", "--learning_rate", default=2e-4, type=float) # Data Parameters parser = LabeledVideoDataModule.add_argparse_args(parser) @@ -42,6 +42,7 @@ def main(): args = parse_args() if args.on_cluster: from slurm import copy_and_run_with_config + copy_and_run_with_config( train, args, @@ -60,5 +61,5 @@ def main(): train(args) -if __name__ == '__main__': +if __name__ == "__main__": main() From 9dba4ad34f2e2887bb0a4c777726ec53de64c35e Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 12:01:52 -0600 Subject: [PATCH 10/15] :art: move sampler statement to its own line --- tutorials/video_classification_example/data.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index b2156f97..2d8890c5 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -102,15 +102,14 @@ def prepare_data(self): download_and_unzip(self.SOURCE_URL, self.root, verify=self.VERIFY_SSL) def train_dataloader(self): + do_use_ddp = self.trainer is not None and self.trainer.use_ddp self.train_dataset = LimitDataset( labeled_video_dataset( data_path=str(Path(self.data_path) / "train"), clip_sampler=make_clip_sampler("random", self.clip_duration), transform=self.train_transform, decode_audio=False, - video_sampler=DistributedSampler - if (self.trainer is not None and self.trainer.use_ddp) - else RandomSampler, + video_sampler=DistributedSampler if do_use_ddp else RandomSampler, ) ) return DataLoader( @@ -118,15 +117,14 @@ def train_dataloader(self): ) def val_dataloader(self): + do_use_ddp = self.trainer is not None and self.trainer.use_ddp self.val_dataset = LimitDataset( labeled_video_dataset( data_path=str(Path(self.data_path) / "val"), clip_sampler=make_clip_sampler("uniform", self.clip_duration), transform=self.val_transform, decode_audio=False, - video_sampler=DistributedSampler - if (self.trainer is not None and self.trainer.use_ddp) - else RandomSampler, + video_sampler=DistributedSampler if do_use_ddp else RandomSampler, ) ) return DataLoader( @@ -180,7 +178,7 @@ def __init__(self, **kwargs): """ super().__init__(**kwargs) - def setup(self, stage=None): 
+ def setup(self, stage: str = None): """Set up anything needed for initializing train/val datasets. This runs on all nodes""" # Names of classes to predict From 0886737ad51aaa1d3081b9e9041b5e2ab8d1fce0 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 12:16:23 -0600 Subject: [PATCH 11/15] :pencil: writing docs --- .../video_classification_example/data.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index 2d8890c5..ca8b3bd0 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -48,6 +48,43 @@ def __init__( workers: int = 4, **kwargs ): + """ + A LabeledVideoDataModule expects a dataset in the following format: + + /root # Root Folder + ├── train # Split Folder + │ ├── archery # Class Folder + │ │ ├── -1q7jA3DXQM_000005_000015.mp4 # Videos + │ │ ├── -5NN5hdIwTc_000036_000046.mp4 + │ │ ... + │ ├── bowling + │ │ ├── -5ExwuF5IUI_000030_000040.mp4 + │ │ ... + │ ├── high_jump + │ │ ├── -5ExwuF5IUI_000030_000040.mp4 + │ │ ... + ├── val + │ ├── archery + │ │ ├── -1q7jA3DXQM_000005_000015.mp4 + │ │ ├── -5NN5hdIwTc_000036_000046.mp4 + │ │ ... + │ ├── bowling + │ │ ├── -5ExwuF5IUI_000030_000040.mp4 + │ │ ... + + Args: + root (str, optional): Directory where your dataset is stored. Defaults to "./". + clip_duration (int, optional): Duration of clip samples. Defaults to 2. + video_num_subsampled (int, optional): Number of subsamples to take of individual videos. Defaults to 8. + video_crop_size (int, optional): Size to crop the video to. Defaults to 224. + video_means (Tuple[float], optional): Means used to normalize dataset. Defaults to (0.45, 0.45, 0.45). + video_stds (Tuple[float], optional): Standard deviations used to normalized dataset. Defaults to (0.225, 0.225, 0.225). + video_min_short_side_scale (int, optional): min_size arg passed to pytorchvideo.transforms.RandomShortSideScale. Defaults to 256. + video_max_short_side_scale (int, optional): max_size arg passed to pytorchvideo.transforms.RandomShortSideScale. Defaults to 320. + video_horizontal_flip_p (float, optional): Probability of flipping a training example horizontally. Defaults to 0.5. + batch_size (int, optional): Number of examples per batch. Defaults to 4. + workers (int, optional): Number of DataLoader workers. Defaults to 4. 
+ """ super().__init__() self.root = root From 8a9f527e743ef58a5a6d407fcfaf44ebda934bf7 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 12:35:24 -0600 Subject: [PATCH 12/15] :pencil: update docstring with more specific path --- tutorials/video_classification_example/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index ca8b3bd0..22d2b91b 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -188,7 +188,7 @@ def __init__(self, **kwargs): The folder structure looks like the following: - /data_dir + /root/action_youtube_naudio ├── basketball # Class Folder Path │ ├── v_shooting_01 # Scene/Group Folder Path │ │ ├── v_shooting_01_01.avi # Video Path From 608c16be705243d135978277a3a3b7490beb1d99 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 12:44:22 -0600 Subject: [PATCH 13/15] :pencil: add periods to keep it consistent --- .../video_classification_example/data.py | 22 +++++++++---------- .../video_classification_example/models.py | 20 ++++++++--------- .../video_classification_example/train.py | 6 ++--- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index 22d2b91b..49972d9b 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -100,7 +100,7 @@ def __init__( self.batch_size = batch_size self.workers = workers - # Transforms applied to train dataset + # Transforms applied to train dataset. self.train_transform = ApplyTransformToKey( key="video", transform=Compose( @@ -118,7 +118,7 @@ def __init__( ), ) - # Transforms applied on val dataset or for inference + # Transforms applied on val dataset or for inference. self.val_transform = ApplyTransformToKey( key="video", transform=Compose( @@ -216,9 +216,9 @@ def __init__(self, **kwargs): super().__init__(**kwargs) def setup(self, stage: str = None): - """Set up anything needed for initializing train/val datasets. This runs on all nodes""" + """Set up anything needed for initializing train/val datasets. This runs on all nodes.""" - # Names of classes to predict + # Names of classes to predict. # Ex. ['basketball', 'biking', 'diving', ...] self.classes = sorted(x.name for x in self.data_path.glob("*") if x.is_dir()) @@ -226,23 +226,23 @@ def setup(self, stage: str = None): # Ex. {'basketball': 0, 'biking': 1, 'diving': 2, ...} self.label_to_id = {} - # A list to hold all available scenes across all classes + # A list to hold all available scenes across all classes. scene_folders = [] for class_id, class_name in enumerate(self.classes): self.label_to_id[class_name] = class_id - # The path of a class folder within self.data_path + # The path of a class folder within self.data_path. # Ex. 'action_youtube_naudio/{basketball|biking|diving|...}' class_folder = self.data_path / class_name - # Collect scene folders within this class + # Collect scene folders within this class. # Ex. 'action_youtube_naudio/basketball/v_shooting_01' for scene_folder in filter(Path.is_dir, class_folder.glob("v_*")): scene_folders.append(scene_folder) - # Randomly shuffle the scene folders before splitting them into train/val + # Randomly shuffle the scene folders before splitting them into train/val. shuffle(scene_folders) # Determine number of scenes in train/validation splits. 
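Given the train/val layout documented above, a minimal usage sketch of the data module together with the Lightning classifier looks like the following. The dataset root and the class count of 6 are placeholders, and the tutorial modules (data.py, models.py) are assumed to be importable:

    import pytorch_lightning as pl
    from data import LabeledVideoDataModule
    from models import VideoClassificationLightningModule

    dm = LabeledVideoDataModule(
        root="/path/to/dataset", clip_duration=2, batch_size=4, workers=2
    )
    model = VideoClassificationLightningModule(num_classes=6, lr=2e-4)

    # Adjust gpus/max_epochs to your hardware and budget.
    trainer = pl.Trainer(gpus=1, max_epochs=5, replace_sampler_ddp=False)
    trainer.fit(model, dm)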
@@ -250,18 +250,18 @@ def setup(self, stage: str = None): self.num_val_scenes = len(scene_folders) - self.num_train_scenes # Collect train/val paths to videos within each scene folder. - # Validation only uses videos from scenes not seen by model during training + # Validation only uses videos from scenes not seen by model during training. self.train_paths = [] self.val_paths = [] for i, scene_path in enumerate(scene_folders): - # The actual name of the class (Ex. 'basketball') + # The actual name of the class (Ex. 'basketball'). class_name = scene_path.parent.name # Loop over all the videos within the given scene folder. for video_path in scene_path.glob("*.avi"): - # Construct a tuple containing (, ) + # Construct a tuple containing (, ). # In our case, we assign the class's ID as 'label'. labeled_path = (video_path, {"label": self.label_to_id[class_name]}) diff --git a/tutorials/video_classification_example/models.py b/tutorials/video_classification_example/models.py index 600f59ce..52233bd1 100644 --- a/tutorials/video_classification_example/models.py +++ b/tutorials/video_classification_example/models.py @@ -26,10 +26,10 @@ def __init__(self, num_classes: int = 11, lr: float = 2e-4, **kwargs): # Saves all kwargs to self.hparams. Use references to self.hparams., not the init args themselves. self.save_hyperparameters() - # Build the model in separate function so its easier to override + # Build the model in separate function so its easier to override. self.model = self._build_model() - # Metrics we will keep track of + # Metrics we will keep track of. self.loss_fn = nn.CrossEntropyLoss() self.train_acc = pl.metrics.Accuracy() self.val_acc = pl.metrics.Accuracy() @@ -80,20 +80,20 @@ def shared_step(self, batch, mode: str): torch.Tensor: The loss for a single batch step. """ - # Pass video tensor through model to get outputs + # Pass video tensor through model to get outputs. outputs = self(batch["video"]) - # Compute and log the cross entropy loss to {train|val}_loss in TensorBoard + # Compute and log the cross entropy loss to {train|val}_loss in TensorBoard. loss = self.loss_fn(outputs, batch["label"]) self.log(f"{mode}_loss", loss) - # Predicted class probabilities - (BATCH_SIZE, NUM_CLASSES) + # Predicted class probabilities - (BATCH_SIZE, NUM_CLASSES). proba = outputs.softmax(dim=1) - # Predicted classes - (BATCH_SIZE,) + # Predicted classes - (BATCH_SIZE,). preds = proba.argmax(dim=1) - # Compute the predicted class accuracy and log it to {train|val}_acc in TensorBoard + # Compute the predicted class accuracy and log it to {train|val}_acc in TensorBoard. acc = self.accuracy[mode](preds, batch["label"]) self.log(f"{mode}_acc", acc, prog_bar=True) @@ -125,7 +125,7 @@ def __init__(self, freeze_backbone: bool = True, pretrained: bool = True, **kwar ) def _build_model(self): - # The pretrained resnet model - we strip off its head to get the backbone + # The pretrained resnet model - we strip off its head to get the backbone. resnet = torch.hub.load( "facebookresearch/pytorchvideo", "slow_r50", @@ -133,12 +133,12 @@ def _build_model(self): ) self.backbone = nn.Sequential(*list(resnet.children())[0][:-1]) - # Freeze the backbone layers if specified + # Freeze the backbone layers if specified. if self.hparams.freeze_backbone: for param in self.backbone.parameters(): param.requires_grad = False - # Create a new head we will train on top of the backbone + # Create a new head we will train on top of the backbone. 
self.head = create_res_basic_head( in_features=2048, out_features=self.hparams.num_classes ) diff --git a/tutorials/video_classification_example/train.py b/tutorials/video_classification_example/train.py index 0d7b5ebb..8568b9c2 100644 --- a/tutorials/video_classification_example/train.py +++ b/tutorials/video_classification_example/train.py @@ -14,13 +14,13 @@ def parse_args(args=None): parser.add_argument("--working_directory", default=".", type=str) parser.add_argument("--partition", default="dev", type=str) - # Model Parameters + # Model Parameters. parser.add_argument("--lr", "--learning_rate", default=2e-4, type=float) - # Data Parameters + # Data Parameters. parser = LabeledVideoDataModule.add_argparse_args(parser) - # Training Parameters + # Training Parameters. parser = pl.Trainer.add_argparse_args(parser) parser.set_defaults( callbacks=[pl.callbacks.LearningRateMonitor()], From cde0ce75db03f1bbe7c52878ef5035151e7e1d11 Mon Sep 17 00:00:00 2001 From: nateraw Date: Sun, 23 May 2021 18:44:35 -0600 Subject: [PATCH 14/15] :fire: remove inline comments --- tutorials/video_classification_example/models.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tutorials/video_classification_example/models.py b/tutorials/video_classification_example/models.py index 52233bd1..2ec97fa6 100644 --- a/tutorials/video_classification_example/models.py +++ b/tutorials/video_classification_example/models.py @@ -80,20 +80,14 @@ def shared_step(self, batch, mode: str): torch.Tensor: The loss for a single batch step. """ - # Pass video tensor through model to get outputs. outputs = self(batch["video"]) - # Compute and log the cross entropy loss to {train|val}_loss in TensorBoard. loss = self.loss_fn(outputs, batch["label"]) self.log(f"{mode}_loss", loss) - # Predicted class probabilities - (BATCH_SIZE, NUM_CLASSES). proba = outputs.softmax(dim=1) - - # Predicted classes - (BATCH_SIZE,). preds = proba.argmax(dim=1) - # Compute the predicted class accuracy and log it to {train|val}_acc in TensorBoard. acc = self.accuracy[mode](preds, batch["label"]) self.log(f"{mode}_acc", acc, prog_bar=True) From 6608f9ace2145658ab4f1452bfd2832c5f2d5d6a Mon Sep 17 00:00:00 2001 From: nateraw Date: Sun, 23 May 2021 18:48:00 -0600 Subject: [PATCH 15/15] :fire: removing incomplete finetuning tutorial for now --- website/docs/tutorial_finetuning.md | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 website/docs/tutorial_finetuning.md diff --git a/website/docs/tutorial_finetuning.md b/website/docs/tutorial_finetuning.md deleted file mode 100644 index ba7d1763..00000000 --- a/website/docs/tutorial_finetuning.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -id: tutorial_finetuning -title: Finetune a TorchHub model for Classification ---- - -# Introduction - -In this tutorial, you will learn how to finetune a pre-trained [Slow Resnet50 model from TorchHub](https://pytorch.org/hub/facebookresearch_pytorchvideo_resnet/) on the [UCF11 Dataset](https://www.crcv.ucf.edu/data/UCF_YouTube_Action.php). -
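Finally, the pieces added across this series can be combined without the CLI. A hedged end-to-end sketch for UCF11 finetuning; it downloads the UCF11 archive and the slow_r50 weights on first run (so it needs network access), and the hyperparameters are illustrative only:

    import pytorch_lightning as pl
    from data import UCF11DataModule
    from models import SlowResnet50LightningModel

    pl.seed_everything(224)

    dm = UCF11DataModule(root="./", clip_duration=2, batch_size=8, workers=4)
    model = SlowResnet50LightningModel(
        num_classes=UCF11DataModule.NUM_CLASSES,
        lr=2e-4,
        freeze_backbone=True,
        pretrained=True,
    )

    # Adjust gpus/max_epochs to your hardware and budget.
    trainer = pl.Trainer(gpus=1, max_epochs=10, replace_sampler_ddp=False)
    trainer.fit(model, dm)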