From 559b3b6463cb62469eea691a5215af58c8c71ca6 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 13 May 2021 11:48:20 -0600 Subject: [PATCH 01/15] :sparkles: introduce finetuning example --- .../video_classification_example/finetune.py | 255 ++++++++++++++++++ .../video_classification_example/train.py | 15 +- 2 files changed, 264 insertions(+), 6 deletions(-) create mode 100644 tutorials/video_classification_example/finetune.py diff --git a/tutorials/video_classification_example/finetune.py b/tutorials/video_classification_example/finetune.py new file mode 100644 index 00000000..21fce5bb --- /dev/null +++ b/tutorials/video_classification_example/finetune.py @@ -0,0 +1,255 @@ +from pathlib import Path +from argparse import Namespace +from torchvision.transforms._transforms_video import CenterCropVideo +from pytorchvideo.data import LabeledVideoDataset +from pytorchvideo.data.clip_sampling import UniformClipSampler +import pytorch_lightning as pl +import torch +from pytorchvideo.models.head import create_res_basic_head +from torch import nn +from torch.optim import Adam + +# HACK +from train import * + + +class UCF11DataModule(KineticsDataModule): + + def __init__( + self, + root="./", + batch_size=32, + num_workers=8, + holdout_scene=None, + side_size = 256, + crop_size = 256, + clip_mean = (0.45, 0.45, 0.45), + clip_std = (0.225, 0.225, 0.225), + num_frames = 8, + sampling_rate = 8, + frames_per_second = 30 + ): + super().__init__(Namespace(data_type='video', batch_size=batch_size, workers=num_workers)) + + self.root = Path(root) / 'action_youtube_naudio' + assert self.root.exists(), "Dataset not found." + self.batch_size = batch_size + self.num_workers = num_workers + self.holdout_scene = holdout_scene + self.side_size = side_size + self.mean = clip_mean + self.std = clip_std + self.crop_size = crop_size + self.num_frames = num_frames + self.sampling_rate = sampling_rate + self.frames_per_second = frames_per_second + self.clip_duration = (self.num_frames * self.sampling_rate) / self.frames_per_second + + self.classes = [x.name for x in self.root.glob("*") if x.is_dir()] + self.id_to_label = dict(zip(range(len(self.classes)), self.classes)) + self.class_to_label = dict(zip(self.classes, range(len(self.classes)))) + self.num_classes = len(self.classes) + + + # TODO - too many repeated .glob calls here. + self.train_paths = [] + self.val_paths = [] + self.holdout_scenes = {} + for c in self.classes: + + # Scenes within each class directory + scene_names = sorted(set(x.name for x in (self.root / c).glob("*") if x.is_dir() and x.name != 'Annotation')) + + # Holdout the last scene + # TODO - wrap this in a function so users can override the split logic + holdout_scene = scene_names[-1] + scene_names = scene_names[:-1] + + # Keep track of which scenes we held out for each class w/ a dict + self.holdout_scenes[c] = holdout_scene + + # Prepare the list of 'labeled paths' required by the LabeledVideoDataset + label_paths = [(v, {"label": self.class_to_label[c]}) for v in (self.root / c).glob("**/*.avi")] + + # HACK - this is no bueno. 
Can be done within the loop above + self.train_paths.extend([x for x in label_paths if x[0].parent.name != holdout_scene]) + self.val_paths.extend([x for x in label_paths if x[0].parent.name == holdout_scene]) + + def _video_transform(self, mode: str): + # TODO - different tsfm for val/train + return ApplyTransformToKey( + key="video", + transform=Compose( + [ + UniformTemporalSubsample(self.num_frames), + Lambda(lambda x: x / 255.0), + Normalize(self.mean, self.std), + ShortSideScale(size=self.side_size), + CenterCropVideo(crop_size=(self.crop_size, self.crop_size)), + ] + ), + ) + + def _make_dataset(self, mode: str): + """ + Defines the train DataLoader that the PyTorch Lightning Trainer trains/tests with. + """ + sampler = DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler + return LimitDataset(LabeledVideoDataset( + self.train_paths if mode == 'train' else self.val_paths, + UniformClipSampler(self.clip_duration), + decode_audio=False, + transform=self._make_transforms(mode=mode), + video_sampler=sampler, + )) + + def train_dataloader(self): + self.train_dataset = self._make_dataset('train') + return torch.utils.data.DataLoader( + self.train_dataset, + batch_size=self.args.batch_size, + num_workers=self.args.workers, + ) + + def val_dataloader(self): + self.val_dataset = self._make_dataset('val') + return torch.utils.data.DataLoader( + self.val_dataset, + batch_size=self.args.batch_size, + num_workers=self.args.workers, + ) + + +class MiniKineticsDataModule(KineticsDataModule): + TRAIN_PATH = 'train' + VAL_PATH = 'val' + + +class Classifier(pl.LightningModule): + + def __init__(self, num_classes: int = 11, lr: float = 2e-4, freeze_backbone: bool = True): + super().__init__() + self.save_hyperparameters() + + # Backbone + resnet = torch.hub.load("facebookresearch/pytorchvideo", "slow_r50", pretrained=True) + self.backbone = nn.Sequential(*list(resnet.children())[0][:-1]) + + if self.hparams.freeze_backbone: + for param in self.backbone.parameters(): + param.requires_grad = False + + # Head + self.head = create_res_basic_head(in_features=2048, out_features=self.hparams.num_classes) + + # Metrics + self.loss_fn = nn.CrossEntropyLoss() + self.train_acc = pl.metrics.Accuracy() + self.val_acc = pl.metrics.Accuracy() + self.accuracy = {'train': self.train_acc, 'val': self.val_acc} + + def forward(self, x): + if isinstance(x, dict): + x = x["video"] + feats = self.backbone(x) + return self.head(feats) + + def shared_step(self, batch, mode: str): + y_hat = self(batch["video"]) + loss = self.loss_fn(y_hat, batch["label"]) + self.log(f"{mode}_loss", loss) + + if mode in ["val", "test"]: + preds = y_hat.argmax(dim=1) + acc = self.accuracy[mode](preds, batch["label"]) + self.log(f"{mode}_acc", acc, prog_bar=True) + + return loss + + def training_step(self, batch, batch_idx): + return self.shared_step(batch, "train") + + def validation_step(self, batch, batch_idx): + return self.shared_step(batch, "val") + + def test_step(self, batch, batch_idx): + return self.shared_step(batch, "test") + + def configure_optimizers(self): + return Adam(self.parameters(), lr=self.hparams.lr) + + +def main(): + """ + To train the ResNet with the Kinetics dataset we construct the two modules above, + and pass them to the fit function of a pytorch_lightning.Trainer. + + This example can be run either locally (with default parameters) or on a Slurm + cluster. To run on a Slurm cluster provide the --on_cluster argument. 
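+
+    For a quick local run, an invocation might look something like the following
+    (the flags shown here are only illustrative; adjust them for your own setup):
+
+        python finetune.py --data_path /path/to/dataset --batch_size 16 --workers 4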
+ """ + setup_logger() + + pytorch_lightning.trainer.seed_everything() + parser = argparse.ArgumentParser() + + # Cluster parameters. + parser.add_argument("--on_cluster", action="store_true") + parser.add_argument("--job_name", default="ptv_video_classification", type=str) + parser.add_argument("--working_directory", default=".", type=str) + parser.add_argument("--partition", default="dev", type=str) + + # Model parameters. + parser.add_argument("--lr", "--learning-rate", default=0.1, type=float) + parser.add_argument("--momentum", default=0.9, type=float) + parser.add_argument("--weight_decay", default=1e-4, type=float) + parser.add_argument( + "--arch", + default="video_resnet", + choices=["video_resnet", "audio_resnet"], + type=str, + ) + + # Data parameters. + parser.add_argument("--data_path", default=None, type=str, required=True) + parser.add_argument("--video_path_prefix", default="", type=str) + parser.add_argument("--workers", default=8, type=int) + parser.add_argument("--batch_size", default=32, type=int) + parser.add_argument("--clip_duration", default=2, type=float) + parser.add_argument( + "--data_type", default="video", choices=["video", "audio"], type=str + ) + parser.add_argument("--video_num_subsampled", default=8, type=int) + parser.add_argument("--video_means", default=(0.45, 0.45, 0.45), type=tuple) + parser.add_argument("--video_stds", default=(0.225, 0.225, 0.225), type=tuple) + parser.add_argument("--video_crop_size", default=224, type=int) + parser.add_argument("--video_min_short_side_scale", default=256, type=int) + parser.add_argument("--video_max_short_side_scale", default=320, type=int) + parser.add_argument("--video_horizontal_flip_p", default=0.5, type=float) + parser.add_argument("--audio_raw_sample_rate", default=44100, type=int) + parser.add_argument("--audio_resampled_rate", default=16000, type=int) + parser.add_argument("--audio_mel_window_size", default=32, type=int) + parser.add_argument("--audio_mel_step_size", default=16, type=int) + parser.add_argument("--audio_num_mels", default=80, type=int) + parser.add_argument("--audio_mel_num_subsample", default=128, type=int) + parser.add_argument("--audio_logmel_mean", default=-7.03, type=float) + parser.add_argument("--audio_logmel_std", default=4.66, type=float) + + # Trainer parameters. + parser = pytorch_lightning.Trainer.add_argparse_args(parser) + parser.set_defaults( + max_epochs=200, + callbacks=[LearningRateMonitor()], + replace_sampler_ddp=False, + reload_dataloaders_every_epoch=False, + ) + args = parser.parse_args() + + # Get data, model, configure trainer, and train + data = MiniKineticsDataModule(args) + model = Classifier(num_classes=6) + trainer = pl.Trainer(gpus=1, precision=16, max_epochs=5) + trainer.fit(model, data) + + +if __name__ == "__main__": + main() diff --git a/tutorials/video_classification_example/train.py b/tutorials/video_classification_example/train.py index b2d896ba..17beb719 100644 --- a/tutorials/video_classification_example/train.py +++ b/tutorials/video_classification_example/train.py @@ -19,7 +19,6 @@ ShortSideScale, UniformTemporalSubsample, ) -from slurm import copy_and_run_with_config from torch.utils.data import DistributedSampler, RandomSampler from torchaudio.transforms import MelSpectrogram, Resample from torchvision.transforms import ( @@ -179,6 +178,9 @@ class KineticsDataModule(pytorch_lightning.LightningDataModule): preprocessing transforms and configures the PyTorch DataLoaders. 
""" + TRAIN_PATH = 'train.csv' + VAL_PATH = 'val.csv' + def __init__(self, args): self.args = args super().__init__() @@ -297,11 +299,11 @@ def train_dataloader(self): """ Defines the train DataLoader that the PyTorch Lightning Trainer trains/tests with. """ - sampler = DistributedSampler if self.trainer.use_ddp else RandomSampler + sampler = DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler train_transform = self._make_transforms(mode="train") self.train_dataset = LimitDataset( pytorchvideo.data.Kinetics( - data_path=os.path.join(self.args.data_path, "train.csv"), + data_path=os.path.join(self.args.data_path, self.TRAIN_PATH), clip_sampler=pytorchvideo.data.make_clip_sampler( "random", self.args.clip_duration ), @@ -320,11 +322,11 @@ def val_dataloader(self): """ Defines the train DataLoader that the PyTorch Lightning Trainer trains/tests with. """ - sampler = DistributedSampler if self.trainer.use_ddp else RandomSampler + sampler = DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler val_transform = self._make_transforms(mode="val") self.val_dataset = LimitDataset( pytorchvideo.data.Kinetics( - data_path=os.path.join(self.args.data_path, "val.csv"), + data_path=os.path.join(self.args.data_path, self.VAL_PATH), clip_sampler=pytorchvideo.data.make_clip_sampler( "uniform", self.args.clip_duration ), @@ -359,7 +361,7 @@ def __getitem__(self, index): return next(self.dataset_iter) def __len__(self): - return self.dataset.num_videos() + return self.dataset.num_videos def main(): @@ -430,6 +432,7 @@ def main(): args = parser.parse_args() if args.on_cluster: + from slurm import copy_and_run_with_config copy_and_run_with_config( train, args, From 4c97dfc794606549cff860ae9c8d2ae49dd5a8c8 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 13 May 2021 19:07:49 -0600 Subject: [PATCH 02/15] :art: improve structure, modularity of code --- .../video_classification_example/data.py | 250 ++++++++++++++++++ .../video_classification_example/finetune.py | 182 +++---------- 2 files changed, 282 insertions(+), 150 deletions(-) create mode 100644 tutorials/video_classification_example/data.py diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py new file mode 100644 index 00000000..13344680 --- /dev/null +++ b/tutorials/video_classification_example/data.py @@ -0,0 +1,250 @@ +import requests +from argparse import Namespace, ArgumentParser +import pytorch_lightning +from pathlib import Path +from shutil import unpack_archive +from pytorchvideo.transforms import ( + ApplyTransformToKey, + Normalize, + RandomShortSideScale, + RemoveKey, + ShortSideScale, + UniformTemporalSubsample, +) +from pytorchvideo.data import LabeledVideoDataset + +from torch.utils.data import DistributedSampler, RandomSampler +from torchaudio.transforms import MelSpectrogram, Resample +from torchvision.transforms import ( + CenterCrop, + Compose, + Lambda, + RandomCrop, + RandomHorizontalFlip, +) +from pytorchvideo.data import make_clip_sampler +from pytorchvideo.data.labeled_video_dataset import labeled_video_dataset +import torch +import itertools +from torch.utils.data import DataLoader +from random import shuffle + + +class LabeledVideoDataModule(pytorch_lightning.LightningDataModule): + + TRAIN_PATH = "train.csv" + VAL_PATH = "val.csv" + SOURCE_URL = None + SOURCE_DIR_NAME = None + + def __init__(self, args): + super().__init__() + self.args = args + self.root = Path(self.args.data_path) / 
self.SOURCE_DIR_NAME + if not (self.SOURCE_URL is None or self.SOURCE_DIR_NAME is None): + if not self.root.exists(): + download_and_unzip(self.SOURCE_URL, self.args.data_path, verify=getattr(self.args, 'verify', True)) + + def _make_transforms(self, mode: str): + + if self.args.data_type == "video": + transform = [ + self._video_transform(mode), + RemoveKey("audio"), + ] + elif self.args.data_type == "audio": + transform = [ + self._audio_transform(), + RemoveKey("video"), + ] + else: + raise Exception(f"{self.args.data_type} not supported") + + return Compose(transform) + + def _video_transform(self, mode: str): + args = self.args + return ApplyTransformToKey( + key="video", + transform=Compose( + [ + UniformTemporalSubsample(args.video_num_subsampled), + Normalize(args.video_means, args.video_stds), + ] + + ( + [ + RandomShortSideScale( + min_size=args.video_min_short_side_scale, + max_size=args.video_max_short_side_scale, + ), + RandomCrop(args.video_crop_size), + RandomHorizontalFlip(p=args.video_horizontal_flip_p), + ] + if mode == "train" + else [ + ShortSideScale(args.video_min_short_side_scale), + CenterCrop(args.video_crop_size), + ] + ) + ), + ) + + def _audio_transform(self): + args = self.args + n_fft = int(float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size) + hop_length = int(float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size) + eps = 1e-10 + return ApplyTransformToKey( + key="audio", + transform=Compose( + [ + Resample( + orig_freq=args.audio_raw_sample_rate, + new_freq=args.audio_resampled_rate, + ), + MelSpectrogram( + sample_rate=args.audio_resampled_rate, + n_fft=n_fft, + hop_length=hop_length, + n_mels=args.audio_num_mels, + center=False, + ), + Lambda(lambda x: x.clamp(min=eps)), + Lambda(torch.log), + UniformTemporalSubsample(args.audio_mel_num_subsample), + Lambda(lambda x: x.transpose(1, 0)), # (F, T) -> (T, F) + Lambda(lambda x: x.view(1, x.size(0), 1, x.size(1))), # (T, F) -> (1, T, 1, F) + Normalize((args.audio_logmel_mean,), (args.audio_logmel_std,)), + ] + ), + ) + + def _make_ds_and_loader(self, mode: str): + ds = LimitDataset( + labeled_video_dataset( + data_path=str(Path(self.root) / (self.TRAIN_PATH if mode == 'train' else self.VAL_PATH)), + clip_sampler=make_clip_sampler("random" if mode == 'train' else 'uniform', self.args.clip_duration), + video_path_prefix=self.args.video_path_prefix, + transform=self._make_transforms(mode=mode), + video_sampler=DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler, + ) + ) + return ds, DataLoader(ds, batch_size=self.args.batch_size, num_workers=self.args.workers) + + def train_dataloader(self): + self.train_dataset, loader = self._make_ds_and_loader('train') + return loader + + def val_dataloader(self): + self.val_dataset, loader = self._make_ds_and_loader('val') + return loader + + +class LimitDataset(torch.utils.data.Dataset): + """ + To ensure a constant number of samples are retrieved from the dataset we use this + LimitDataset wrapper. This is necessary because several of the underlying videos + may be corrupted while fetching or decoding, however, we always want the same + number of steps per epoch. 
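+
+    LabeledVideoDataset is an iterable-style dataset, so we draw samples from a
+    chained, repeated iterator over it and report the underlying dataset's
+    num_videos as this wrapper's length.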
+ """ + + def __init__(self, dataset): + super().__init__() + self.dataset = dataset + self.dataset_iter = itertools.chain.from_iterable(itertools.repeat(iter(dataset), 2)) + + def __getitem__(self, index): + return next(self.dataset_iter) + + def __len__(self): + return self.dataset.num_videos + + +class KineticsDataModule(LabeledVideoDataModule): + TRAIN_PATH = 'train.csv' + VAL_PATH = 'val.csv' + NUM_CLASSES = 700 + + +class MiniKineticsDataModule(LabeledVideoDataModule): + + TRAIN_PATH = "train" + VAL_PATH = "val" + SOURCE_URL = "https://pl-flash-data.s3.amazonaws.com/kinetics.zip" + SOURCE_DIR_NAME = 'kinetics' + NUM_CLASSES = 6 + + +class UCF11DataModule(LabeledVideoDataModule): + TRAIN_PATH = None + VAL_PATH = None + SOURCE_URL = "https://www.crcv.ucf.edu/data/YouTube_DataSet_Annotated.zip" + SOURCE_DIR_NAME = 'action_youtube_naudio' + NUM_CLASSES = 11 + + def __init__(self, args): + args.verify = False + super().__init__(args) + + data_path = Path(self.args.data_path) + root = data_path / self.SOURCE_DIR_NAME + self.classes = [x.name for x in root.glob("*") if x.is_dir()] + self.id_to_label = dict(zip(range(len(self.classes)), self.classes)) + self.class_to_label = {v: k for k, v in self.id_to_label.items()} + self.num_classes = len(self.classes) + + self.train_paths = [] + self.val_paths = [] + self.holdout_scenes = {} + for c in self.classes: + + # Scenes within each class directory + scene_names = sorted(x.name for x in (root / c).glob("*") if x.is_dir() and x.name != 'Annotation') + shuffle(scene_names) + + # Holdout a random actor/scene + holdout_scene = scene_names[-1] + scene_names = scene_names[:-1] + + # Keep track of which scenes we held out for each class w/ a dict + self.holdout_scenes[c] = holdout_scene + + for v in (root / c).glob('**/*.avi'): + labeled_path = (v, {"label": self.class_to_label[c]}) + if v.parent.name != holdout_scene: + self.train_paths.append(labeled_path) + else: + self.val_paths.append(labeled_path) + + + def _make_ds_and_loader(self, mode: str): + ds = LimitDataset( + LabeledVideoDataset( + self.train_paths if mode == 'train' else self.val_paths, + clip_sampler=make_clip_sampler("random" if mode == 'train' else 'uniform', self.args.clip_duration), + decode_audio=False, + transform=self._make_transforms(mode=mode), + video_sampler=DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler, + ) + ) + return ds, DataLoader(ds, batch_size=self.args.batch_size, num_workers=self.args.workers) + + +def download_and_unzip(url, data_dir="./", verify=True): + data_dir = Path(data_dir) + zipfile_name = url.split("/")[-1] + data_zip_path = data_dir / zipfile_name + data_dir.mkdir(exist_ok=True, parents=True) + + if not data_zip_path.exists(): + resp = requests.get(url, verify=verify) + + with data_zip_path.open("wb") as f: + f.write(resp.content) + + unpack_archive(data_zip_path, extract_dir=data_dir) + + +if __name__ == "__main__": + args = parse_args('--batch_size 4 --data_path ./yt_data'.split()) + dm = UCF11DataModule(args) diff --git a/tutorials/video_classification_example/finetune.py b/tutorials/video_classification_example/finetune.py index 21fce5bb..35055634 100644 --- a/tutorials/video_classification_example/finetune.py +++ b/tutorials/video_classification_example/finetune.py @@ -1,138 +1,30 @@ -from pathlib import Path -from argparse import Namespace -from torchvision.transforms._transforms_video import CenterCropVideo -from pytorchvideo.data import LabeledVideoDataset -from pytorchvideo.data.clip_sampling import 
UniformClipSampler +from argparse import ArgumentParser + import pytorch_lightning as pl import torch -from pytorchvideo.models.head import create_res_basic_head from torch import nn from torch.optim import Adam +from pytorchvideo.models.head import create_res_basic_head -# HACK -from train import * - - -class UCF11DataModule(KineticsDataModule): - - def __init__( - self, - root="./", - batch_size=32, - num_workers=8, - holdout_scene=None, - side_size = 256, - crop_size = 256, - clip_mean = (0.45, 0.45, 0.45), - clip_std = (0.225, 0.225, 0.225), - num_frames = 8, - sampling_rate = 8, - frames_per_second = 30 - ): - super().__init__(Namespace(data_type='video', batch_size=batch_size, workers=num_workers)) - - self.root = Path(root) / 'action_youtube_naudio' - assert self.root.exists(), "Dataset not found." - self.batch_size = batch_size - self.num_workers = num_workers - self.holdout_scene = holdout_scene - self.side_size = side_size - self.mean = clip_mean - self.std = clip_std - self.crop_size = crop_size - self.num_frames = num_frames - self.sampling_rate = sampling_rate - self.frames_per_second = frames_per_second - self.clip_duration = (self.num_frames * self.sampling_rate) / self.frames_per_second - - self.classes = [x.name for x in self.root.glob("*") if x.is_dir()] - self.id_to_label = dict(zip(range(len(self.classes)), self.classes)) - self.class_to_label = dict(zip(self.classes, range(len(self.classes)))) - self.num_classes = len(self.classes) - - - # TODO - too many repeated .glob calls here. - self.train_paths = [] - self.val_paths = [] - self.holdout_scenes = {} - for c in self.classes: - - # Scenes within each class directory - scene_names = sorted(set(x.name for x in (self.root / c).glob("*") if x.is_dir() and x.name != 'Annotation')) - - # Holdout the last scene - # TODO - wrap this in a function so users can override the split logic - holdout_scene = scene_names[-1] - scene_names = scene_names[:-1] - - # Keep track of which scenes we held out for each class w/ a dict - self.holdout_scenes[c] = holdout_scene - - # Prepare the list of 'labeled paths' required by the LabeledVideoDataset - label_paths = [(v, {"label": self.class_to_label[c]}) for v in (self.root / c).glob("**/*.avi")] - - # HACK - this is no bueno. Can be done within the loop above - self.train_paths.extend([x for x in label_paths if x[0].parent.name != holdout_scene]) - self.val_paths.extend([x for x in label_paths if x[0].parent.name == holdout_scene]) - - def _video_transform(self, mode: str): - # TODO - different tsfm for val/train - return ApplyTransformToKey( - key="video", - transform=Compose( - [ - UniformTemporalSubsample(self.num_frames), - Lambda(lambda x: x / 255.0), - Normalize(self.mean, self.std), - ShortSideScale(size=self.side_size), - CenterCropVideo(crop_size=(self.crop_size, self.crop_size)), - ] - ), - ) - - def _make_dataset(self, mode: str): - """ - Defines the train DataLoader that the PyTorch Lightning Trainer trains/tests with. 
- """ - sampler = DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler - return LimitDataset(LabeledVideoDataset( - self.train_paths if mode == 'train' else self.val_paths, - UniformClipSampler(self.clip_duration), - decode_audio=False, - transform=self._make_transforms(mode=mode), - video_sampler=sampler, - )) - - def train_dataloader(self): - self.train_dataset = self._make_dataset('train') - return torch.utils.data.DataLoader( - self.train_dataset, - batch_size=self.args.batch_size, - num_workers=self.args.workers, - ) - - def val_dataloader(self): - self.val_dataset = self._make_dataset('val') - return torch.utils.data.DataLoader( - self.val_dataset, - batch_size=self.args.batch_size, - num_workers=self.args.workers, - ) - - -class MiniKineticsDataModule(KineticsDataModule): - TRAIN_PATH = 'train' - VAL_PATH = 'val' +from data import UCF11DataModule, KineticsDataModule, MiniKineticsDataModule +from models import Classifier + + +DATASET_MAP = { + "ucf11": UCF11DataModule, + "kinetics": KineticsDataModule, + "kinetics-mini": MiniKineticsDataModule, +} class Classifier(pl.LightningModule): - def __init__(self, num_classes: int = 11, lr: float = 2e-4, freeze_backbone: bool = True): + def __init__(self, num_classes: int = 11, lr: float = 2e-4, freeze_backbone: bool = True, pretrained: bool = True): super().__init__() self.save_hyperparameters() # Backbone - resnet = torch.hub.load("facebookresearch/pytorchvideo", "slow_r50", pretrained=True) + resnet = torch.hub.load("facebookresearch/pytorchvideo", 'slow_r50', pretrained=self.hparams.pretrained) self.backbone = nn.Sequential(*list(resnet.children())[0][:-1]) if self.hparams.freeze_backbone: @@ -179,24 +71,8 @@ def configure_optimizers(self): return Adam(self.parameters(), lr=self.hparams.lr) -def main(): - """ - To train the ResNet with the Kinetics dataset we construct the two modules above, - and pass them to the fit function of a pytorch_lightning.Trainer. - - This example can be run either locally (with default parameters) or on a Slurm - cluster. To run on a Slurm cluster provide the --on_cluster argument. - """ - setup_logger() - - pytorch_lightning.trainer.seed_everything() - parser = argparse.ArgumentParser() - - # Cluster parameters. - parser.add_argument("--on_cluster", action="store_true") - parser.add_argument("--job_name", default="ptv_video_classification", type=str) - parser.add_argument("--working_directory", default=".", type=str) - parser.add_argument("--partition", default="dev", type=str) +def parse_args(args=None): + parser = ArgumentParser() # Model parameters. parser.add_argument("--lr", "--learning-rate", default=0.1, type=float) @@ -209,7 +85,10 @@ def main(): type=str, ) - # Data parameters. + # Data parameters + parser.add_argument( + "--dataset", default="ucf11", choices=["ucf11", "kinetics", "kinetics-mini"] + ) parser.add_argument("--data_path", default=None, type=str, required=True) parser.add_argument("--video_path_prefix", default="", type=str) parser.add_argument("--workers", default=8, type=int) @@ -235,21 +114,24 @@ def main(): parser.add_argument("--audio_logmel_std", default=4.66, type=float) # Trainer parameters. 
- parser = pytorch_lightning.Trainer.add_argparse_args(parser) + parser = pl.Trainer.add_argparse_args(parser) parser.set_defaults( max_epochs=200, - callbacks=[LearningRateMonitor()], + callbacks=[pl.callbacks.LearningRateMonitor()], replace_sampler_ddp=False, reload_dataloaders_every_epoch=False, ) - args = parser.parse_args() + return parser.parse_args(args=args) + - # Get data, model, configure trainer, and train - data = MiniKineticsDataModule(args) - model = Classifier(num_classes=6) - trainer = pl.Trainer(gpus=1, precision=16, max_epochs=5) - trainer.fit(model, data) +def main(args): + pl.trainer.seed_everything() + dm_cls = DATASET_MAP.get(args.dataset) + dm = dm_cls(args) + model = Classifier(num_classes=dm_cls.NUM_CLASSES) + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model, dm) if __name__ == "__main__": - main() + main(parse_args()) From 42b3d7c679359f65f1a13fa9111caff57c1db9ca Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 13 May 2021 19:10:44 -0600 Subject: [PATCH 03/15] :lipstick: style --- .../video_classification_example/data.py | 98 ++++++++++++------- .../video_classification_example/finetune.py | 28 ++++-- 2 files changed, 82 insertions(+), 44 deletions(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index 13344680..4dd379cb 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -1,8 +1,14 @@ -import requests -from argparse import Namespace, ArgumentParser -import pytorch_lightning +import itertools +from argparse import ArgumentParser, Namespace from pathlib import Path +from random import shuffle from shutil import unpack_archive + +import pytorch_lightning +import requests +import torch +from pytorchvideo.data import LabeledVideoDataset, make_clip_sampler +from pytorchvideo.data.labeled_video_dataset import labeled_video_dataset from pytorchvideo.transforms import ( ApplyTransformToKey, Normalize, @@ -11,9 +17,7 @@ ShortSideScale, UniformTemporalSubsample, ) -from pytorchvideo.data import LabeledVideoDataset - -from torch.utils.data import DistributedSampler, RandomSampler +from torch.utils.data import DataLoader, DistributedSampler, RandomSampler from torchaudio.transforms import MelSpectrogram, Resample from torchvision.transforms import ( CenterCrop, @@ -22,12 +26,6 @@ RandomCrop, RandomHorizontalFlip, ) -from pytorchvideo.data import make_clip_sampler -from pytorchvideo.data.labeled_video_dataset import labeled_video_dataset -import torch -import itertools -from torch.utils.data import DataLoader -from random import shuffle class LabeledVideoDataModule(pytorch_lightning.LightningDataModule): @@ -43,7 +41,11 @@ def __init__(self, args): self.root = Path(self.args.data_path) / self.SOURCE_DIR_NAME if not (self.SOURCE_URL is None or self.SOURCE_DIR_NAME is None): if not self.root.exists(): - download_and_unzip(self.SOURCE_URL, self.args.data_path, verify=getattr(self.args, 'verify', True)) + download_and_unzip( + self.SOURCE_URL, + self.args.data_path, + verify=getattr(self.args, "verify", True), + ) def _make_transforms(self, mode: str): @@ -91,8 +93,12 @@ def _video_transform(self, mode: str): def _audio_transform(self): args = self.args - n_fft = int(float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size) - hop_length = int(float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size) + n_fft = int( + float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size + ) + hop_length = int( + 
float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size + ) eps = 1e-10 return ApplyTransformToKey( key="audio", @@ -113,7 +119,9 @@ def _audio_transform(self): Lambda(torch.log), UniformTemporalSubsample(args.audio_mel_num_subsample), Lambda(lambda x: x.transpose(1, 0)), # (F, T) -> (T, F) - Lambda(lambda x: x.view(1, x.size(0), 1, x.size(1))), # (T, F) -> (1, T, 1, F) + Lambda( + lambda x: x.view(1, x.size(0), 1, x.size(1)) + ), # (T, F) -> (1, T, 1, F) Normalize((args.audio_logmel_mean,), (args.audio_logmel_std,)), ] ), @@ -122,21 +130,30 @@ def _audio_transform(self): def _make_ds_and_loader(self, mode: str): ds = LimitDataset( labeled_video_dataset( - data_path=str(Path(self.root) / (self.TRAIN_PATH if mode == 'train' else self.VAL_PATH)), - clip_sampler=make_clip_sampler("random" if mode == 'train' else 'uniform', self.args.clip_duration), + data_path=str( + Path(self.root) + / (self.TRAIN_PATH if mode == "train" else self.VAL_PATH) + ), + clip_sampler=make_clip_sampler( + "random" if mode == "train" else "uniform", self.args.clip_duration + ), video_path_prefix=self.args.video_path_prefix, transform=self._make_transforms(mode=mode), - video_sampler=DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler, + video_sampler=DistributedSampler + if (self.trainer is not None and self.trainer.use_ddp) + else RandomSampler, ) ) - return ds, DataLoader(ds, batch_size=self.args.batch_size, num_workers=self.args.workers) + return ds, DataLoader( + ds, batch_size=self.args.batch_size, num_workers=self.args.workers + ) def train_dataloader(self): - self.train_dataset, loader = self._make_ds_and_loader('train') + self.train_dataset, loader = self._make_ds_and_loader("train") return loader def val_dataloader(self): - self.val_dataset, loader = self._make_ds_and_loader('val') + self.val_dataset, loader = self._make_ds_and_loader("val") return loader @@ -151,7 +168,9 @@ class LimitDataset(torch.utils.data.Dataset): def __init__(self, dataset): super().__init__() self.dataset = dataset - self.dataset_iter = itertools.chain.from_iterable(itertools.repeat(iter(dataset), 2)) + self.dataset_iter = itertools.chain.from_iterable( + itertools.repeat(iter(dataset), 2) + ) def __getitem__(self, index): return next(self.dataset_iter) @@ -161,8 +180,8 @@ def __len__(self): class KineticsDataModule(LabeledVideoDataModule): - TRAIN_PATH = 'train.csv' - VAL_PATH = 'val.csv' + TRAIN_PATH = "train.csv" + VAL_PATH = "val.csv" NUM_CLASSES = 700 @@ -171,7 +190,7 @@ class MiniKineticsDataModule(LabeledVideoDataModule): TRAIN_PATH = "train" VAL_PATH = "val" SOURCE_URL = "https://pl-flash-data.s3.amazonaws.com/kinetics.zip" - SOURCE_DIR_NAME = 'kinetics' + SOURCE_DIR_NAME = "kinetics" NUM_CLASSES = 6 @@ -179,7 +198,7 @@ class UCF11DataModule(LabeledVideoDataModule): TRAIN_PATH = None VAL_PATH = None SOURCE_URL = "https://www.crcv.ucf.edu/data/YouTube_DataSet_Annotated.zip" - SOURCE_DIR_NAME = 'action_youtube_naudio' + SOURCE_DIR_NAME = "action_youtube_naudio" NUM_CLASSES = 11 def __init__(self, args): @@ -199,7 +218,11 @@ def __init__(self, args): for c in self.classes: # Scenes within each class directory - scene_names = sorted(x.name for x in (root / c).glob("*") if x.is_dir() and x.name != 'Annotation') + scene_names = sorted( + x.name + for x in (root / c).glob("*") + if x.is_dir() and x.name != "Annotation" + ) shuffle(scene_names) # Holdout a random actor/scene @@ -209,25 +232,30 @@ def __init__(self, args): # Keep track of which scenes we held out for each class w/ a 
dict self.holdout_scenes[c] = holdout_scene - for v in (root / c).glob('**/*.avi'): + for v in (root / c).glob("**/*.avi"): labeled_path = (v, {"label": self.class_to_label[c]}) if v.parent.name != holdout_scene: self.train_paths.append(labeled_path) else: self.val_paths.append(labeled_path) - def _make_ds_and_loader(self, mode: str): ds = LimitDataset( LabeledVideoDataset( - self.train_paths if mode == 'train' else self.val_paths, - clip_sampler=make_clip_sampler("random" if mode == 'train' else 'uniform', self.args.clip_duration), + self.train_paths if mode == "train" else self.val_paths, + clip_sampler=make_clip_sampler( + "random" if mode == "train" else "uniform", self.args.clip_duration + ), decode_audio=False, transform=self._make_transforms(mode=mode), - video_sampler=DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler, + video_sampler=DistributedSampler + if (self.trainer is not None and self.trainer.use_ddp) + else RandomSampler, ) ) - return ds, DataLoader(ds, batch_size=self.args.batch_size, num_workers=self.args.workers) + return ds, DataLoader( + ds, batch_size=self.args.batch_size, num_workers=self.args.workers + ) def download_and_unzip(url, data_dir="./", verify=True): @@ -246,5 +274,5 @@ def download_and_unzip(url, data_dir="./", verify=True): if __name__ == "__main__": - args = parse_args('--batch_size 4 --data_path ./yt_data'.split()) + args = parse_args("--batch_size 4 --data_path ./yt_data".split()) dm = UCF11DataModule(args) diff --git a/tutorials/video_classification_example/finetune.py b/tutorials/video_classification_example/finetune.py index 35055634..d77c9eb1 100644 --- a/tutorials/video_classification_example/finetune.py +++ b/tutorials/video_classification_example/finetune.py @@ -2,12 +2,11 @@ import pytorch_lightning as pl import torch +from data import KineticsDataModule, MiniKineticsDataModule, UCF11DataModule +from models import Classifier +from pytorchvideo.models.head import create_res_basic_head from torch import nn from torch.optim import Adam -from pytorchvideo.models.head import create_res_basic_head - -from data import UCF11DataModule, KineticsDataModule, MiniKineticsDataModule -from models import Classifier DATASET_MAP = { @@ -18,13 +17,22 @@ class Classifier(pl.LightningModule): - - def __init__(self, num_classes: int = 11, lr: float = 2e-4, freeze_backbone: bool = True, pretrained: bool = True): + def __init__( + self, + num_classes: int = 11, + lr: float = 2e-4, + freeze_backbone: bool = True, + pretrained: bool = True, + ): super().__init__() self.save_hyperparameters() # Backbone - resnet = torch.hub.load("facebookresearch/pytorchvideo", 'slow_r50', pretrained=self.hparams.pretrained) + resnet = torch.hub.load( + "facebookresearch/pytorchvideo", + "slow_r50", + pretrained=self.hparams.pretrained, + ) self.backbone = nn.Sequential(*list(resnet.children())[0][:-1]) if self.hparams.freeze_backbone: @@ -32,13 +40,15 @@ def __init__(self, num_classes: int = 11, lr: float = 2e-4, freeze_backbone: boo param.requires_grad = False # Head - self.head = create_res_basic_head(in_features=2048, out_features=self.hparams.num_classes) + self.head = create_res_basic_head( + in_features=2048, out_features=self.hparams.num_classes + ) # Metrics self.loss_fn = nn.CrossEntropyLoss() self.train_acc = pl.metrics.Accuracy() self.val_acc = pl.metrics.Accuracy() - self.accuracy = {'train': self.train_acc, 'val': self.val_acc} + self.accuracy = {"train": self.train_acc, "val": self.val_acc} def forward(self, x): if isinstance(x, 
dict): From fb55f1f53add12fd8522ed9e36110296281cf0e6 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 13 May 2021 19:13:53 -0600 Subject: [PATCH 04/15] :rotating_light: remove unused import --- tutorials/video_classification_example/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index 4dd379cb..6c23dce2 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -1,5 +1,4 @@ import itertools -from argparse import ArgumentParser, Namespace from pathlib import Path from random import shuffle from shutil import unpack_archive From 7fd088010c798a53cce0431f4bb78ec276ab771c Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 00:58:12 -0600 Subject: [PATCH 05/15] :construction: wip --- website/docs/tutorial_finetuning.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 website/docs/tutorial_finetuning.md diff --git a/website/docs/tutorial_finetuning.md b/website/docs/tutorial_finetuning.md new file mode 100644 index 00000000..ba7d1763 --- /dev/null +++ b/website/docs/tutorial_finetuning.md @@ -0,0 +1,9 @@ +--- +id: tutorial_finetuning +title: Finetune a TorchHub model for Classification +--- + +# Introduction + +In this tutorial, you will learn how to finetune a pre-trained [Slow Resnet50 model from TorchHub](https://pytorch.org/hub/facebookresearch_pytorchvideo_resnet/) on the [UCF11 Dataset](https://www.crcv.ucf.edu/data/UCF_YouTube_Action.php). + From 0ec28aa3117e6e236c87d6372f30214945a66940 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 00:58:35 -0600 Subject: [PATCH 06/15] :construction: wip --- .../video_classification_example/data.py | 16 +++++++-- .../video_classification_example/finetune.py | 33 ++++++++++++------- .../video_classification_example/train.py | 3 +- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index 6c23dce2..16eb3857 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -70,6 +70,7 @@ def _video_transform(self, mode: str): transform=Compose( [ UniformTemporalSubsample(args.video_num_subsampled), + Lambda(lambda x: x / 255.0), Normalize(args.video_means, args.video_stds), ] + ( @@ -217,7 +218,7 @@ def __init__(self, args): for c in self.classes: # Scenes within each class directory - scene_names = sorted( + scene_names = list( x.name for x in (root / c).glob("*") if x.is_dir() and x.name != "Annotation" @@ -273,5 +274,16 @@ def download_and_unzip(url, data_dir="./", verify=True): if __name__ == "__main__": - args = parse_args("--batch_size 4 --data_path ./yt_data".split()) + from finetune import parse_args + from train import LearningRateMonitor, VideoClassificationLightningModule + args = parse_args("--gpus 1 --precision 16 --batch_size 8 --data_path ./yt_data".split()) + args.max_epochs = 200 + args.callbacks = [LearningRateMonitor()] + args.replace_sampler_ddp = False + args.reload_dataloaders_every_epoch = False + + pytorch_lightning.trainer.seed_everything(244) dm = UCF11DataModule(args) + model = VideoClassificationLightningModule(args) + trainer = pytorch_lightning.Trainer.from_argparse_args(args) + trainer.fit(model, dm) diff --git a/tutorials/video_classification_example/finetune.py b/tutorials/video_classification_example/finetune.py index d77c9eb1..cff7fa99 100644 --- 
a/tutorials/video_classification_example/finetune.py +++ b/tutorials/video_classification_example/finetune.py @@ -2,8 +2,7 @@ import pytorch_lightning as pl import torch -from data import KineticsDataModule, MiniKineticsDataModule, UCF11DataModule -from models import Classifier +from .data import KineticsDataModule, MiniKineticsDataModule, UCF11DataModule from pytorchvideo.models.head import create_res_basic_head from torch import nn from torch.optim import Adam @@ -17,17 +16,30 @@ class Classifier(pl.LightningModule): + """ + """ def __init__( self, num_classes: int = 11, lr: float = 2e-4, freeze_backbone: bool = True, pretrained: bool = True, + **kwargs ): + """A classifier for finetuning pretrained video classification backbones from + torchhub. We use the slow_r50 model here, but you can edit this class to + use whatever backbone/head you'd like. + + Args: + num_classes (int, optional): Number of output classes. Defaults to 11. + lr (float, optional): The learning rate for the Adam optimizer. Defaults to 2e-4. + freeze_backbone (bool, optional): Whether to freeze the backbone or leave it trainable. Defaults to True. + pretrained (bool, optional): Use the pretrained model from torchhub. When False, we initialize the slow_r50 model from scratch. Defaults to True. + """ super().__init__() self.save_hyperparameters() - # Backbone + # The pretrained resnet model - we strip off its head to get the backbone resnet = torch.hub.load( "facebookresearch/pytorchvideo", "slow_r50", @@ -35,26 +47,24 @@ def __init__( ) self.backbone = nn.Sequential(*list(resnet.children())[0][:-1]) + # Freeze the backbone layers if specified if self.hparams.freeze_backbone: for param in self.backbone.parameters(): param.requires_grad = False - # Head + # Create a new head we will train on top of the backbone self.head = create_res_basic_head( in_features=2048, out_features=self.hparams.num_classes ) - # Metrics + # Metrics we will keep track of self.loss_fn = nn.CrossEntropyLoss() self.train_acc = pl.metrics.Accuracy() self.val_acc = pl.metrics.Accuracy() self.accuracy = {"train": self.train_acc, "val": self.val_acc} - def forward(self, x): - if isinstance(x, dict): - x = x["video"] - feats = self.backbone(x) - return self.head(feats) + def forward(self, x: torch.Tensor): + return self.head(self.backbone(x)) def shared_step(self, batch, mode: str): y_hat = self(batch["video"]) @@ -127,7 +137,6 @@ def parse_args(args=None): parser = pl.Trainer.add_argparse_args(parser) parser.set_defaults( max_epochs=200, - callbacks=[pl.callbacks.LearningRateMonitor()], replace_sampler_ddp=False, reload_dataloaders_every_epoch=False, ) @@ -138,7 +147,7 @@ def main(args): pl.trainer.seed_everything() dm_cls = DATASET_MAP.get(args.dataset) dm = dm_cls(args) - model = Classifier(num_classes=dm_cls.NUM_CLASSES) + model = Classifier(num_classes=dm_cls.NUM_CLASSES, **vars(args)) trainer = pl.Trainer.from_argparse_args(args) trainer.fit(model, dm) diff --git a/tutorials/video_classification_example/train.py b/tutorials/video_classification_example/train.py index 17beb719..8588d129 100644 --- a/tutorials/video_classification_example/train.py +++ b/tutorials/video_classification_example/train.py @@ -80,7 +80,7 @@ def __init__(self, args): if self.args.arch == "video_resnet": self.model = pytorchvideo.models.resnet.create_resnet( input_channel=3, - model_num_class=400, + model_num_class=11 # 400, ) self.batch_key = "video" elif self.args.arch == "audio_resnet": @@ -235,6 +235,7 @@ def _video_transform(self, mode: str): transform=Compose( [ 
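+                    # The transforms below subsample frames, rescale raw pixel values
+                    # from [0, 255] to [0, 1], and then normalize with the dataset mean/std.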
UniformTemporalSubsample(args.video_num_subsampled), + Lambda(lambda x: x/255.0), Normalize(args.video_means, args.video_stds), ] + ( From e126c63143c3d7a9d495ee415645c9e0eccdaa91 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 01:30:45 -0600 Subject: [PATCH 07/15] :pencil: Writing docs. --- .../video_classification_example/data.py | 29 +++++----- .../video_classification_example/finetune.py | 55 ++++++++++++++++--- 2 files changed, 59 insertions(+), 25 deletions(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index 16eb3857..988e47fa 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -128,6 +128,12 @@ def _audio_transform(self): ) def _make_ds_and_loader(self, mode: str): + """Creates both the dataset and dataloader for a given dataset split 'mode'. This returns + both the dataset and the dataloader specified, and should be called from self.{train|val|test}_dataloader(). + + Args: + mode (str): The dataset split to create. Should be 'train' or 'val'. + """ ds = LimitDataset( labeled_video_dataset( data_path=str( @@ -259,6 +265,13 @@ def _make_ds_and_loader(self, mode: str): def download_and_unzip(url, data_dir="./", verify=True): + """Download a zip file from a given URL and unpack it within data_dir. + + Args: + url (str): A URL to a zip file. + data_dir (str, optional): Directory where the zip will be unpacked. Defaults to "./". + verify (bool, optional): Whether to verify SSL certificate when requesting the zip file. Defaults to True. + """ data_dir = Path(data_dir) zipfile_name = url.split("/")[-1] data_zip_path = data_dir / zipfile_name @@ -271,19 +284,3 @@ def download_and_unzip(url, data_dir="./", verify=True): f.write(resp.content) unpack_archive(data_zip_path, extract_dir=data_dir) - - -if __name__ == "__main__": - from finetune import parse_args - from train import LearningRateMonitor, VideoClassificationLightningModule - args = parse_args("--gpus 1 --precision 16 --batch_size 8 --data_path ./yt_data".split()) - args.max_epochs = 200 - args.callbacks = [LearningRateMonitor()] - args.replace_sampler_ddp = False - args.reload_dataloaders_every_epoch = False - - pytorch_lightning.trainer.seed_everything(244) - dm = UCF11DataModule(args) - model = VideoClassificationLightningModule(args) - trainer = pytorch_lightning.Trainer.from_argparse_args(args) - trainer.fit(model, dm) diff --git a/tutorials/video_classification_example/finetune.py b/tutorials/video_classification_example/finetune.py index cff7fa99..2aca590d 100644 --- a/tutorials/video_classification_example/finetune.py +++ b/tutorials/video_classification_example/finetune.py @@ -2,7 +2,7 @@ import pytorch_lightning as pl import torch -from .data import KineticsDataModule, MiniKineticsDataModule, UCF11DataModule +from data import KineticsDataModule, MiniKineticsDataModule, UCF11DataModule from pytorchvideo.models.head import create_res_basic_head from torch import nn from torch.optim import Adam @@ -16,8 +16,7 @@ class Classifier(pl.LightningModule): - """ - """ + def __init__( self, num_classes: int = 11, @@ -35,6 +34,8 @@ def __init__( lr (float, optional): The learning rate for the Adam optimizer. Defaults to 2e-4. freeze_backbone (bool, optional): Whether to freeze the backbone or leave it trainable. Defaults to True. pretrained (bool, optional): Use the pretrained model from torchhub. When False, we initialize the slow_r50 model from scratch. Defaults to True. 
+ + All extra kwargs will be available via self.hparams.. These will also be saved as TensorBoard Hparams. """ super().__init__() self.save_hyperparameters() @@ -64,9 +65,23 @@ def __init__( self.accuracy = {"train": self.train_acc, "val": self.val_acc} def forward(self, x: torch.Tensor): + """ + Forward defines the prediction/inference actions. + """ return self.head(self.backbone(x)) def shared_step(self, batch, mode: str): + """This shared step handles both the training and validation steps to avoid + re-writing the same code more than once. The given `mode` will change the name + of the logged metrics. + + Args: + batch (dict): PyTorchVideo batch dictionary containing a single batch of data. + mode (str): The type of step. Can be 'train', 'val', or 'test'. + + Returns: + torch.Tensor: The loss for a single batch step. + """ y_hat = self(batch["video"]) loss = self.loss_fn(y_hat, batch["label"]) self.log(f"{mode}_loss", loss) @@ -79,9 +94,35 @@ def shared_step(self, batch, mode: str): return loss def training_step(self, batch, batch_idx): + """ + This function is called in the inner loop of the training epoch. It must + return a loss that is used for loss.backwards() internally. The self.log(...) + function can be used to log any training metrics. + + PyTorchVideo batches are dictionaries containing each modality or metadata of + the batch collated video clips. Kinetics contains the following notable keys: + { + 'video': , + 'audio': , + 'label': , + } + + - "video" is a Tensor of shape (batch, channels, time, height, Width) + - "audio" is a Tensor of shape (batch, channels, time, 1, frequency) + - "label" is a Tensor of shape (batch, 1) + + The PyTorchVideo models and transforms expect the same input shapes and + dictionary structure making this function just a matter of unwrapping the dict and + feeding it through the model/loss. + """ return self.shared_step(batch, "train") def validation_step(self, batch, batch_idx): + """ + This function is called in the inner loop of the evaluation cycle. For this + simple example it's mostly the same as the training loop but with a different + metric name. + """ return self.shared_step(batch, "val") def test_step(self, batch, batch_idx): @@ -133,13 +174,9 @@ def parse_args(args=None): parser.add_argument("--audio_logmel_mean", default=-7.03, type=float) parser.add_argument("--audio_logmel_std", default=4.66, type=float) - # Trainer parameters. 
+ # Add PyTorch Lightning's Trainer init arguments as parser flags parser = pl.Trainer.add_argparse_args(parser) - parser.set_defaults( - max_epochs=200, - replace_sampler_ddp=False, - reload_dataloaders_every_epoch=False, - ) + return parser.parse_args(args=args) From e4c8cbff1d843c94013536b604ddcbd8524ae415 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 11:36:01 -0600 Subject: [PATCH 08/15] :art: improve structure + cleanup unnecessary code --- .../video_classification_example/data.py | 394 ++++++++------- .../video_classification_example/finetune.py | 214 ++------ .../video_classification_example/models.py | 150 ++++++ .../video_classification_example/train.py | 460 +----------------- 4 files changed, 411 insertions(+), 807 deletions(-) create mode 100644 tutorials/video_classification_example/models.py diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index 988e47fa..b1c29268 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -2,8 +2,9 @@ from pathlib import Path from random import shuffle from shutil import unpack_archive +from typing import Tuple -import pytorch_lightning +import pytorch_lightning as pl import requests import torch from pytorchvideo.data import LabeledVideoDataset, make_clip_sampler @@ -12,12 +13,11 @@ ApplyTransformToKey, Normalize, RandomShortSideScale, - RemoveKey, ShortSideScale, UniformTemporalSubsample, ) + from torch.utils.data import DataLoader, DistributedSampler, RandomSampler -from torchaudio.transforms import MelSpectrogram, Resample from torchvision.transforms import ( CenterCrop, Compose, @@ -27,241 +27,234 @@ ) -class LabeledVideoDataModule(pytorch_lightning.LightningDataModule): - - TRAIN_PATH = "train.csv" - VAL_PATH = "val.csv" - SOURCE_URL = None - SOURCE_DIR_NAME = None +class LabeledVideoDataModule(pl.LightningDataModule): + + SOURCE_URL: str = None + SOURCE_DIR_NAME: str = "" + NUM_CLASSES: int = 700 + VERIFY_SSL: bool = True + + def __init__( + self, + root: str = './', + clip_duration: int = 2, + video_num_subsampled: int = 8, + video_crop_size: int = 224, + video_means: Tuple[float] = (0.45, 0.45, 0.45), + video_stds: Tuple[float] = (0.225, 0.225, 0.225), + video_min_short_side_scale: int = 256, + video_max_short_side_scale: int = 320, + video_horizontal_flip_p: float = 0.5, + batch_size: int = 4, + workers: int = 4, + **kwargs + ): - def __init__(self, args): super().__init__() - self.args = args - self.root = Path(self.args.data_path) / self.SOURCE_DIR_NAME - if not (self.SOURCE_URL is None or self.SOURCE_DIR_NAME is None): - if not self.root.exists(): - download_and_unzip( - self.SOURCE_URL, - self.args.data_path, - verify=getattr(self.args, "verify", True), - ) - - def _make_transforms(self, mode: str): - - if self.args.data_type == "video": - transform = [ - self._video_transform(mode), - RemoveKey("audio"), - ] - elif self.args.data_type == "audio": - transform = [ - self._audio_transform(), - RemoveKey("video"), - ] - else: - raise Exception(f"{self.args.data_type} not supported") - - return Compose(transform) - - def _video_transform(self, mode: str): - args = self.args - return ApplyTransformToKey( - key="video", + self.root = root + self.data_path = Path(self.root) / self.SOURCE_DIR_NAME + self.clip_duration = clip_duration + self.video_num_subsampled = video_num_subsampled + self.video_crop_size = video_crop_size + self.video_means = video_means + self.video_stds = video_stds + 
self.video_min_short_side_scale = video_min_short_side_scale + self.video_max_short_side_scale = video_max_short_side_scale + self.video_horizontal_flip_p = video_horizontal_flip_p + self.batch_size = batch_size + self.workers = workers + + # Transforms applied to train dataset + self.train_transform = ApplyTransformToKey( + key='video', transform=Compose( [ - UniformTemporalSubsample(args.video_num_subsampled), + UniformTemporalSubsample(self.video_num_subsampled), Lambda(lambda x: x / 255.0), - Normalize(args.video_means, args.video_stds), + Normalize(self.video_means, self.video_stds), + RandomShortSideScale( + min_size=self.video_min_short_side_scale, + max_size=self.video_max_short_side_scale, + ), + RandomCrop(self.video_crop_size), + RandomHorizontalFlip(p=self.video_horizontal_flip_p), ] - + ( - [ - RandomShortSideScale( - min_size=args.video_min_short_side_scale, - max_size=args.video_max_short_side_scale, - ), - RandomCrop(args.video_crop_size), - RandomHorizontalFlip(p=args.video_horizontal_flip_p), - ] - if mode == "train" - else [ - ShortSideScale(args.video_min_short_side_scale), - CenterCrop(args.video_crop_size), - ] - ) - ), + ) ) - def _audio_transform(self): - args = self.args - n_fft = int( - float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size - ) - hop_length = int( - float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size - ) - eps = 1e-10 - return ApplyTransformToKey( - key="audio", + # Transforms applied on val dataset or for inference + self.val_transform = ApplyTransformToKey( + key='video', transform=Compose( [ - Resample( - orig_freq=args.audio_raw_sample_rate, - new_freq=args.audio_resampled_rate, - ), - MelSpectrogram( - sample_rate=args.audio_resampled_rate, - n_fft=n_fft, - hop_length=hop_length, - n_mels=args.audio_num_mels, - center=False, - ), - Lambda(lambda x: x.clamp(min=eps)), - Lambda(torch.log), - UniformTemporalSubsample(args.audio_mel_num_subsample), - Lambda(lambda x: x.transpose(1, 0)), # (F, T) -> (T, F) - Lambda( - lambda x: x.view(1, x.size(0), 1, x.size(1)) - ), # (T, F) -> (1, T, 1, F) - Normalize((args.audio_logmel_mean,), (args.audio_logmel_std,)), + UniformTemporalSubsample(self.video_num_subsampled), + Lambda(lambda x: x / 255.0), + Normalize(self.video_means, self.video_stds), + ShortSideScale(self.video_min_short_side_scale), + CenterCrop(self.video_crop_size) ] - ), + ) ) - def _make_ds_and_loader(self, mode: str): - """Creates both the dataset and dataloader for a given dataset split 'mode'. This returns - both the dataset and the dataloader specified, and should be called from self.{train|val|test}_dataloader(). + def prepare_data(self): + """Download the dataset if it doesn't already exist. This runs only on rank 0""" + if not (self.SOURCE_URL is None or self.SOURCE_DIR_NAME is None): + if not self.data_path.exists(): + download_and_unzip(self.SOURCE_URL, self.root, verify=self.VERIFY_SSL) - Args: - mode (str): The dataset split to create. Should be 'train' or 'val'. 
- """ - ds = LimitDataset( + def train_dataloader(self): + self.train_dataset = LimitDataset( labeled_video_dataset( - data_path=str( - Path(self.root) - / (self.TRAIN_PATH if mode == "train" else self.VAL_PATH) - ), - clip_sampler=make_clip_sampler( - "random" if mode == "train" else "uniform", self.args.clip_duration - ), - video_path_prefix=self.args.video_path_prefix, - transform=self._make_transforms(mode=mode), + data_path=str(Path(self.data_path) / 'train'), + clip_sampler=make_clip_sampler("random", self.clip_duration), + transform=self.train_transform, + decode_audio=False, video_sampler=DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) - else RandomSampler, + else RandomSampler ) ) - return ds, DataLoader( - ds, batch_size=self.args.batch_size, num_workers=self.args.workers + return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.workers) + + def val_dataloader(self): + self.val_dataset = LimitDataset( + labeled_video_dataset( + data_path=str(Path(self.data_path) / 'val'), + clip_sampler=make_clip_sampler("uniform", self.clip_duration), + transform=self.val_transform, + decode_audio=False, + video_sampler=DistributedSampler + if (self.trainer is not None and self.trainer.use_ddp) + else RandomSampler + ) ) + return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.workers) - def train_dataloader(self): - self.train_dataset, loader = self._make_ds_and_loader("train") - return loader - def val_dataloader(self): - self.val_dataset, loader = self._make_ds_and_loader("val") - return loader +class UCF11DataModule(LabeledVideoDataModule): + SOURCE_URL: str = "https://www.crcv.ucf.edu/data/YouTube_DataSet_Annotated.zip" + SOURCE_DIR_NAME: str = "action_youtube_naudio" + NUM_CLASSES: int = 11 + VERIFY_SSL: bool = False -class LimitDataset(torch.utils.data.Dataset): - """ - To ensure a constant number of samples are retrieved from the dataset we use this - LimitDataset wrapper. This is necessary because several of the underlying videos - may be corrupted while fetching or decoding, however, we always want the same - number of steps per epoch. - """ + def __init__(self, **kwargs): + """ + The UCF11 Dataset contains 11 action classes: basketball shooting, biking/cycling, diving, + golf swinging, horse back riding, soccer juggling, swinging, tennis swinging, trampoline jumping, + volleyball spiking, and walking with a dog. + + For each class, the videos are grouped into 25 group/scene folders containing at least 4 video clips each. + The video clips in the same scene folder share some common features, such as the same actor, similar + background, similar viewpoint, and so on. + + The folder structure looks like the following: + + /data_dir + ├── basketball # Class Folder Path + │ ├── v_shooting_01 # Scene/Group Folder Path + │ │ ├── v_shooting_01_01.avi # Video Path + │ │ ├── v_shooting_01_02.avi + │ │ ├── v_shooting_01_03.avi + │ │ ├── ... + │ ├── v_shooting_02 + │ ├── v_shooting_03 + │ ├── ... + │ ... + ├── biking + │ ├── v_biking_01 + │ │ ├── v_biking_01_01.avi + │ │ ├── v_biking_01_02.avi + │ │ ├── v_biking_01_03.avi + │ ├── v_biking_02 + │ ├── v_biking_03 + │ ... + ... + + We take 80% of all scenes and use the videos within for training. The remaining scenes' videos + are used for validation. We do this so the validation data contains only videos from scenes/actors + that the model has not seen yet. 
+ """ + super().__init__(**kwargs) - def __init__(self, dataset): - super().__init__() - self.dataset = dataset - self.dataset_iter = itertools.chain.from_iterable( - itertools.repeat(iter(dataset), 2) - ) + def setup(self, stage=None): + """Set up anything needed for initializing train/val datasets. This runs on all nodes""" - def __getitem__(self, index): - return next(self.dataset_iter) + # Names of classes to predict + # Ex. ['basketball', 'biking', 'diving', ...] + self.classes = sorted(x.name for x in self.data_path.glob("*") if x.is_dir()) - def __len__(self): - return self.dataset.num_videos + # Mapping from label to class id. + # Ex. {'basketball': 0, 'biking': 1, 'diving': 2, ...} + self.label_to_id = {} + # A list to hold all available scenes across all classes + scene_folders = [] -class KineticsDataModule(LabeledVideoDataModule): - TRAIN_PATH = "train.csv" - VAL_PATH = "val.csv" - NUM_CLASSES = 700 + for class_id, class_name in enumerate(self.classes): + self.label_to_id[class_name] = class_id -class MiniKineticsDataModule(LabeledVideoDataModule): + # The path of a class folder within self.data_path + # Ex. 'action_youtube_naudio/{basketball|biking|diving|...}' + class_folder = self.data_path / class_name - TRAIN_PATH = "train" - VAL_PATH = "val" - SOURCE_URL = "https://pl-flash-data.s3.amazonaws.com/kinetics.zip" - SOURCE_DIR_NAME = "kinetics" - NUM_CLASSES = 6 + # Collect scene folders within this class + # Ex. 'action_youtube_naudio/basketball/v_shooting_01' + for scene_folder in filter(Path.is_dir, class_folder.glob('v_*')): + scene_folders.append(scene_folder) + # Randomly shuffle the scene folders before splitting them into train/val + shuffle(scene_folders) -class UCF11DataModule(LabeledVideoDataModule): - TRAIN_PATH = None - VAL_PATH = None - SOURCE_URL = "https://www.crcv.ucf.edu/data/YouTube_DataSet_Annotated.zip" - SOURCE_DIR_NAME = "action_youtube_naudio" - NUM_CLASSES = 11 - - def __init__(self, args): - args.verify = False - super().__init__(args) - - data_path = Path(self.args.data_path) - root = data_path / self.SOURCE_DIR_NAME - self.classes = [x.name for x in root.glob("*") if x.is_dir()] - self.id_to_label = dict(zip(range(len(self.classes)), self.classes)) - self.class_to_label = {v: k for k, v in self.id_to_label.items()} - self.num_classes = len(self.classes) + # Determine number of scenes in train/validation splits. + self.num_train_scenes = int(0.8 * len(scene_folders)) + self.num_val_scenes = len(scene_folders) - self.num_train_scenes + # Collect train/val paths to videos within each scene folder. + # Validation only uses videos from scenes not seen by model during training self.train_paths = [] self.val_paths = [] - self.holdout_scenes = {} - for c in self.classes: - - # Scenes within each class directory - scene_names = list( - x.name - for x in (root / c).glob("*") - if x.is_dir() and x.name != "Annotation" - ) - shuffle(scene_names) + for i, scene_path in enumerate(scene_folders): + + # The actual name of the class (Ex. 'basketball') + class_name = scene_path.parent.name - # Holdout a random actor/scene - holdout_scene = scene_names[-1] - scene_names = scene_names[:-1] + # Loop over all the videos within the given scene folder. + for video_path in scene_path.glob("*.avi"): - # Keep track of which scenes we held out for each class w/ a dict - self.holdout_scenes[c] = holdout_scene + # Construct a tuple containing (, ) + # In our case, we assign the class's ID as 'label'. 
+ labeled_path = (video_path, {"label": self.label_to_id[class_name]}) - for v in (root / c).glob("**/*.avi"): - labeled_path = (v, {"label": self.class_to_label[c]}) - if v.parent.name != holdout_scene: + if i < self.num_train_scenes: self.train_paths.append(labeled_path) else: self.val_paths.append(labeled_path) - def _make_ds_and_loader(self, mode: str): - ds = LimitDataset( + def train_dataloader(self): + self.train_dataset = LimitDataset( LabeledVideoDataset( - self.train_paths if mode == "train" else self.val_paths, - clip_sampler=make_clip_sampler( - "random" if mode == "train" else "uniform", self.args.clip_duration - ), + self.train_paths, + clip_sampler=make_clip_sampler('random', self.clip_duration), decode_audio=False, - transform=self._make_transforms(mode=mode), - video_sampler=DistributedSampler - if (self.trainer is not None and self.trainer.use_ddp) - else RandomSampler, + transform=self.train_transform, + video_sampler=RandomSampler ) ) - return ds, DataLoader( - ds, batch_size=self.args.batch_size, num_workers=self.args.workers + return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.workers) + + def val_dataloader(self): + self.val_dataset = LimitDataset( + LabeledVideoDataset( + self.val_paths, + clip_sampler=make_clip_sampler('uniform', self.clip_duration), + decode_audio=False, + transform=self.val_transform, + video_sampler=RandomSampler + ) ) + return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.workers) def download_and_unzip(url, data_dir="./", verify=True): @@ -284,3 +277,26 @@ def download_and_unzip(url, data_dir="./", verify=True): f.write(resp.content) unpack_archive(data_zip_path, extract_dir=data_dir) + + +class LimitDataset(torch.utils.data.Dataset): + + """ + To ensure a constant number of samples are retrieved from the dataset we use this + LimitDataset wrapper. This is necessary because several of the underlying videos + may be corrupted while fetching or decoding, however, we always want the same + number of steps per epoch. + """ + + def __init__(self, dataset): + super().__init__() + self.dataset = dataset + self.dataset_iter = itertools.chain.from_iterable( + itertools.repeat(iter(dataset), 2) + ) + + def __getitem__(self, index): + return next(self.dataset_iter) + + def __len__(self): + return self.dataset.num_videos diff --git a/tutorials/video_classification_example/finetune.py b/tutorials/video_classification_example/finetune.py index 2aca590d..db097110 100644 --- a/tutorials/video_classification_example/finetune.py +++ b/tutorials/video_classification_example/finetune.py @@ -1,193 +1,39 @@ -from argparse import ArgumentParser - import pytorch_lightning as pl -import torch -from data import KineticsDataModule, MiniKineticsDataModule, UCF11DataModule -from pytorchvideo.models.head import create_res_basic_head -from torch import nn -from torch.optim import Adam - - -DATASET_MAP = { - "ucf11": UCF11DataModule, - "kinetics": KineticsDataModule, - "kinetics-mini": MiniKineticsDataModule, -} - -class Classifier(pl.LightningModule): +from data import UCF11DataModule +from models import SlowResnet50LightningModel +from train import parse_args - def __init__( - self, - num_classes: int = 11, - lr: float = 2e-4, - freeze_backbone: bool = True, - pretrained: bool = True, - **kwargs - ): - """A classifier for finetuning pretrained video classification backbones from - torchhub. We use the slow_r50 model here, but you can edit this class to - use whatever backbone/head you'd like. 
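The (video_path, label_dict) pairs built above feed straight into LabeledVideoDataset. A stripped-down sketch of that pattern follows; the .avi paths are placeholders and must exist on disk before the loader is iterated.

    from pytorchvideo.data import LabeledVideoDataset, make_clip_sampler
    from torch.utils.data import DataLoader, RandomSampler

    labeled_paths = [
        ("action_youtube_naudio/basketball/v_shooting_01/v_shooting_01_01.avi", {"label": 0}),
        ("action_youtube_naudio/biking/v_biking_01/v_biking_01_01.avi", {"label": 1}),
    ]

    dataset = LabeledVideoDataset(
        labeled_paths,
        clip_sampler=make_clip_sampler("random", 2.0),
        video_sampler=RandomSampler,
        decode_audio=False,
        # In the module above a train/val transform is also passed; omitted here for brevity.
    )
    loader = DataLoader(dataset, batch_size=2, num_workers=0)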
- Args: - num_classes (int, optional): Number of output classes. Defaults to 11. - lr (float, optional): The learning rate for the Adam optimizer. Defaults to 2e-4. - freeze_backbone (bool, optional): Whether to freeze the backbone or leave it trainable. Defaults to True. - pretrained (bool, optional): Use the pretrained model from torchhub. When False, we initialize the slow_r50 model from scratch. Defaults to True. - - All extra kwargs will be available via self.hparams.. These will also be saved as TensorBoard Hparams. - """ - super().__init__() - self.save_hyperparameters() - - # The pretrained resnet model - we strip off its head to get the backbone - resnet = torch.hub.load( - "facebookresearch/pytorchvideo", - "slow_r50", - pretrained=self.hparams.pretrained, - ) - self.backbone = nn.Sequential(*list(resnet.children())[0][:-1]) +def train(args): + pl.seed_everything(224) + dm = UCF11DataModule(**vars(args)) + model = SlowResnet50LightningModel(num_classes=dm.NUM_CLASSES, **vars(args)) + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model, dm) - # Freeze the backbone layers if specified - if self.hparams.freeze_backbone: - for param in self.backbone.parameters(): - param.requires_grad = False - # Create a new head we will train on top of the backbone - self.head = create_res_basic_head( - in_features=2048, out_features=self.hparams.num_classes +def main(): + args = parse_args() + if args.on_cluster: + from slurm import copy_and_run_with_config + copy_and_run_with_config( + train, + args, + args.working_directory, + job_name=args.job_name, + time="72:00:00", + partition=args.partition, + gpus_per_node=args.gpus, + ntasks_per_node=args.gpus, + cpus_per_task=10, + mem="470GB", + nodes=args.num_nodes, + constraint="volta32gb", ) - - # Metrics we will keep track of - self.loss_fn = nn.CrossEntropyLoss() - self.train_acc = pl.metrics.Accuracy() - self.val_acc = pl.metrics.Accuracy() - self.accuracy = {"train": self.train_acc, "val": self.val_acc} - - def forward(self, x: torch.Tensor): - """ - Forward defines the prediction/inference actions. - """ - return self.head(self.backbone(x)) - - def shared_step(self, batch, mode: str): - """This shared step handles both the training and validation steps to avoid - re-writing the same code more than once. The given `mode` will change the name - of the logged metrics. - - Args: - batch (dict): PyTorchVideo batch dictionary containing a single batch of data. - mode (str): The type of step. Can be 'train', 'val', or 'test'. - - Returns: - torch.Tensor: The loss for a single batch step. - """ - y_hat = self(batch["video"]) - loss = self.loss_fn(y_hat, batch["label"]) - self.log(f"{mode}_loss", loss) - - if mode in ["val", "test"]: - preds = y_hat.argmax(dim=1) - acc = self.accuracy[mode](preds, batch["label"]) - self.log(f"{mode}_acc", acc, prog_bar=True) - - return loss - - def training_step(self, batch, batch_idx): - """ - This function is called in the inner loop of the training epoch. It must - return a loss that is used for loss.backwards() internally. The self.log(...) - function can be used to log any training metrics. - - PyTorchVideo batches are dictionaries containing each modality or metadata of - the batch collated video clips. 
Kinetics contains the following notable keys: - { - 'video': , - 'audio': , - 'label': , - } - - - "video" is a Tensor of shape (batch, channels, time, height, Width) - - "audio" is a Tensor of shape (batch, channels, time, 1, frequency) - - "label" is a Tensor of shape (batch, 1) - - The PyTorchVideo models and transforms expect the same input shapes and - dictionary structure making this function just a matter of unwrapping the dict and - feeding it through the model/loss. - """ - return self.shared_step(batch, "train") - - def validation_step(self, batch, batch_idx): - """ - This function is called in the inner loop of the evaluation cycle. For this - simple example it's mostly the same as the training loop but with a different - metric name. - """ - return self.shared_step(batch, "val") - - def test_step(self, batch, batch_idx): - return self.shared_step(batch, "test") - - def configure_optimizers(self): - return Adam(self.parameters(), lr=self.hparams.lr) - - -def parse_args(args=None): - parser = ArgumentParser() - - # Model parameters. - parser.add_argument("--lr", "--learning-rate", default=0.1, type=float) - parser.add_argument("--momentum", default=0.9, type=float) - parser.add_argument("--weight_decay", default=1e-4, type=float) - parser.add_argument( - "--arch", - default="video_resnet", - choices=["video_resnet", "audio_resnet"], - type=str, - ) - - # Data parameters - parser.add_argument( - "--dataset", default="ucf11", choices=["ucf11", "kinetics", "kinetics-mini"] - ) - parser.add_argument("--data_path", default=None, type=str, required=True) - parser.add_argument("--video_path_prefix", default="", type=str) - parser.add_argument("--workers", default=8, type=int) - parser.add_argument("--batch_size", default=32, type=int) - parser.add_argument("--clip_duration", default=2, type=float) - parser.add_argument( - "--data_type", default="video", choices=["video", "audio"], type=str - ) - parser.add_argument("--video_num_subsampled", default=8, type=int) - parser.add_argument("--video_means", default=(0.45, 0.45, 0.45), type=tuple) - parser.add_argument("--video_stds", default=(0.225, 0.225, 0.225), type=tuple) - parser.add_argument("--video_crop_size", default=224, type=int) - parser.add_argument("--video_min_short_side_scale", default=256, type=int) - parser.add_argument("--video_max_short_side_scale", default=320, type=int) - parser.add_argument("--video_horizontal_flip_p", default=0.5, type=float) - parser.add_argument("--audio_raw_sample_rate", default=44100, type=int) - parser.add_argument("--audio_resampled_rate", default=16000, type=int) - parser.add_argument("--audio_mel_window_size", default=32, type=int) - parser.add_argument("--audio_mel_step_size", default=16, type=int) - parser.add_argument("--audio_num_mels", default=80, type=int) - parser.add_argument("--audio_mel_num_subsample", default=128, type=int) - parser.add_argument("--audio_logmel_mean", default=-7.03, type=float) - parser.add_argument("--audio_logmel_std", default=4.66, type=float) - - # Add PyTorch Lightning's Trainer init arguments as parser flags - parser = pl.Trainer.add_argparse_args(parser) - - return parser.parse_args(args=args) - - -def main(args): - pl.trainer.seed_everything() - dm_cls = DATASET_MAP.get(args.dataset) - dm = dm_cls(args) - model = Classifier(num_classes=dm_cls.NUM_CLASSES, **vars(args)) - trainer = pl.Trainer.from_argparse_args(args) - trainer.fit(model, dm) + else: # local + train(args) -if __name__ == "__main__": - main(parse_args()) +if __name__ == '__main__': + main() diff 
--git a/tutorials/video_classification_example/models.py b/tutorials/video_classification_example/models.py new file mode 100644 index 00000000..9571e8d8 --- /dev/null +++ b/tutorials/video_classification_example/models.py @@ -0,0 +1,150 @@ +import pytorch_lightning as pl +import torch +from torch import nn +from pytorchvideo.models.resnet import create_resnet +from pytorchvideo.models.head import create_res_basic_head + + +class VideoClassificationLightningModule(pl.LightningModule): + + def __init__( + self, + num_classes: int = 11, + lr: float = 2e-4, + **kwargs + ): + """A classifier for finetuning pretrained video classification backbones from + torchhub. We use the slow_r50 model here, but you can edit this class to + use whatever backbone/head you'd like. + + Args: + num_classes (int, optional): Number of output classes. Defaults to 11. + lr (float, optional): The learning rate for the Adam optimizer. Defaults to 2e-4. + freeze_backbone (bool, optional): Whether to freeze the backbone or leave it trainable. Defaults to True. + pretrained (bool, optional): Use the pretrained model from torchhub. When False, we initialize the + slow_r50 model from scratch. Defaults to True. + + All extra kwargs will be available via self.hparams.. These will also be saved as + TensorBoard Hparams. + """ + super().__init__() + + # Saves all kwargs to self.hparams. Use references to self.hparams., not the init args themselves. + self.save_hyperparameters() + + # Build the model in separate function so its easier to override + self.model = self._build_model() + + # Metrics we will keep track of + self.loss_fn = nn.CrossEntropyLoss() + self.train_acc = pl.metrics.Accuracy() + self.val_acc = pl.metrics.Accuracy() + self.accuracy = {"train": self.train_acc, "val": self.val_acc} + + def _build_model(self): + return create_resnet(model_num_class=self.hparams.num_classes) + + def on_train_epoch_start(self): + """ + For distributed training we need to set the datasets video sampler epoch so + that shuffling is done correctly + """ + epoch = self.trainer.current_epoch + if self.trainer.use_ddp: + self.trainer.datamodule.train_dataset.dataset.video_sampler.set_epoch(epoch) + + def forward(self, x: torch.Tensor): + """ + Forward defines the prediction/inference actions. + """ + return self.model(x) + + def shared_step(self, batch, mode: str): + """This shared step handles both the training and validation steps to avoid + re-writing the same code more than once. The given `mode` will change the name + of the logged metrics. + + PyTorchVideo batches are dictionaries containing each modality or metadata of + the batch collated video clips. Kinetics contains the following notable keys: + { + 'video': , + 'label': , + } + + - "video" is a Tensor of shape (batch, channels, time, height, Width) + - "label" is a Tensor of shape (batch, 1) + + The PyTorchVideo models and transforms expect the same input shapes and + dictionary structure making this function just a matter of unwrapping the dict and + feeding it through the model/loss. + + Args: + batch (dict): PyTorchVideo batch dictionary containing a single batch of data. + mode (str): The type of step. Can be 'train', 'val', or 'test'. + + Returns: + torch.Tensor: The loss for a single batch step. 
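The batch layout referenced in the docstring above can be exercised with purely synthetic data. A short sketch using the same create_resnet backbone as the default _build_model(); the batch size, clip length, and class count here are arbitrary:

    import torch
    import torch.nn.functional as F
    from pytorchvideo.models.resnet import create_resnet

    model = create_resnet(model_num_class=11)

    batch = {
        "video": torch.randn(2, 3, 8, 224, 224),  # (batch, channels, time, height, width)
        "label": torch.tensor([0, 3]),            # (batch,)
    }

    outputs = model(batch["video"])               # (2, 11) class logits
    loss = F.cross_entropy(outputs, batch["label"])
    preds = outputs.softmax(dim=1).argmax(dim=1)  # (2,) predicted class ids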
+ """ + + # Pass video tensor through model to get outputs + outputs = self(batch["video"]) + + # Compute and log the cross entropy loss to {train|val}_loss in TensorBoard + loss = self.loss_fn(outputs, batch["label"]) + self.log(f"{mode}_loss", loss) + + # Predicted class probabilities - (BATCH_SIZE, NUM_CLASSES) + proba = outputs.softmax(dim=1) + + # Predicted classes - (BATCH_SIZE,) + preds = proba.argmax(dim=1) + + # Compute the predicted class accuracy and log it to {train|val}_acc in TensorBoard + acc = self.accuracy[mode](preds, batch["label"]) + self.log(f"{mode}_acc", acc, prog_bar=True) + + return loss + + def training_step(self, batch, batch_idx): + """ + This function is called in the inner loop of the training epoch. It must + return a loss that is used for loss.backwards() internally. + """ + return self.shared_step(batch, "train") + + def validation_step(self, batch, batch_idx): + """ + This function is called in the inner loop of the evaluation cycle. For this + simple example it's mostly the same as the training loop but with a different + metric name. + """ + return self.shared_step(batch, "val") + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=self.hparams.lr) + + +class SlowResnet50LightningModel(VideoClassificationLightningModule): + + def __init__(self, freeze_backbone: bool = True, pretrained: bool = True, **kwargs): + super().__init__(freeze_backbone=freeze_backbone, pretrained=pretrained, **kwargs) + + def _build_model(self): + # The pretrained resnet model - we strip off its head to get the backbone + resnet = torch.hub.load( + "facebookresearch/pytorchvideo", + "slow_r50", + pretrained=self.hparams.pretrained, + ) + self.backbone = nn.Sequential(*list(resnet.children())[0][:-1]) + + # Freeze the backbone layers if specified + if self.hparams.freeze_backbone: + for param in self.backbone.parameters(): + param.requires_grad = False + + # Create a new head we will train on top of the backbone + self.head = create_res_basic_head( + in_features=2048, out_features=self.hparams.num_classes + ) + return nn.Sequential(self.backbone, self.head) diff --git a/tutorials/video_classification_example/train.py b/tutorials/video_classification_example/train.py index 8588d129..93ec1eb2 100644 --- a/tutorials/video_classification_example/train.py +++ b/tutorials/video_classification_example/train.py @@ -1,382 +1,12 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from argparse import ArgumentParser -import argparse -import itertools -import logging -import os +import pytorch_lightning as pl +from data import LabeledVideoDataModule +from models import VideoClassificationLightningModule -import pytorch_lightning -import pytorchvideo.data -import pytorchvideo.models.resnet -import torch -import torch.nn.functional as F -from pytorch_lightning.callbacks import LearningRateMonitor -from pytorchvideo.transforms import ( - ApplyTransformToKey, - Normalize, - RandomShortSideScale, - RemoveKey, - ShortSideScale, - UniformTemporalSubsample, -) -from torch.utils.data import DistributedSampler, RandomSampler -from torchaudio.transforms import MelSpectrogram, Resample -from torchvision.transforms import ( - CenterCrop, - Compose, - Lambda, - RandomCrop, - RandomHorizontalFlip, -) - -""" -This video classification example demonstrates how PyTorchVideo models, datasets and -transforms can be used with PyTorch Lightning module. Specifically it shows how a -simple pipeline to train a Resnet on the Kinetics video dataset can be built. 
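One quick way to sanity-check the freeze_backbone behaviour above is to count which parameters remain trainable after the backbone/head surgery. A rough sketch, assuming torch.hub can fetch the slow_r50 definition (network access required):

    import torch
    from torch import nn
    from pytorchvideo.models.head import create_res_basic_head

    resnet = torch.hub.load("facebookresearch/pytorchvideo", "slow_r50", pretrained=False)
    backbone = nn.Sequential(*list(resnet.children())[0][:-1])
    for param in backbone.parameters():
        param.requires_grad = False

    head = create_res_basic_head(in_features=2048, out_features=11)
    model = nn.Sequential(backbone, head)

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable parameters: {trainable} / {total}")  # only the new head is trainable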
- -Don't worry if you don't have PyTorch Lightning experience. We'll provide an explanation -of how the PyTorch Lightning module works to accompany the example. - -The code can be separated into three main components: -1. VideoClassificationLightningModule (pytorch_lightning.LightningModule), this defines: - - how the model is constructed, - - the inner train or validation loop (i.e. computing loss/metrics from a minibatch) - - optimizer configuration - -2. KineticsDataModule (pytorch_lightning.LightningDataModule), this defines: - - how to fetch/prepare the dataset - - the train and val dataloaders for the associated dataset - -3. pytorch_lightning.Trainer, this is a concrete PyTorch Lightning class that provides - the training pipeline configuration and a fit(, ) - function to start the training/validation loop. - -All three components are combined in the train() function. We'll explain the rest of the -details inline. -""" - - -class VideoClassificationLightningModule(pytorch_lightning.LightningModule): - def __init__(self, args): - """ - This LightningModule implementation constructs a PyTorchVideo ResNet, - defines the train and val loss to be trained with (cross_entropy), and - configures the optimizer. - """ - self.args = args - super().__init__() - self.train_accuracy = pytorch_lightning.metrics.Accuracy() - self.val_accuracy = pytorch_lightning.metrics.Accuracy() - - ############# - # PTV Model # - ############# - - # Here we construct the PyTorchVideo model. For this example we're using a - # ResNet that works with Kinetics (e.g. 400 num_classes). For your application, - # this could be changed to any other PyTorchVideo model (e.g. for SlowFast use - # create_slowfast). - if self.args.arch == "video_resnet": - self.model = pytorchvideo.models.resnet.create_resnet( - input_channel=3, - model_num_class=11 # 400, - ) - self.batch_key = "video" - elif self.args.arch == "audio_resnet": - self.model = pytorchvideo.models.resnet.create_acoustic_resnet( - input_channel=1, - model_num_class=400, - ) - self.batch_key = "audio" - else: - raise Exception("{self.args.arch} not supported") - - def on_train_epoch_start(self): - """ - For distributed training we need to set the datasets video sampler epoch so - that shuffling is done correctly - """ - epoch = self.trainer.current_epoch - if self.trainer.use_ddp: - self.trainer.datamodule.train_dataset.dataset.video_sampler.set_epoch(epoch) - - def forward(self, x): - """ - Forward defines the prediction/inference actions. - """ - return self.model(x) - - def training_step(self, batch, batch_idx): - """ - This function is called in the inner loop of the training epoch. It must - return a loss that is used for loss.backwards() internally. The self.log(...) - function can be used to log any training metrics. - - PyTorchVideo batches are dictionaries containing each modality or metadata of - the batch collated video clips. Kinetics contains the following notable keys: - { - 'video': , - 'audio': , - 'label': , - } - - - "video" is a Tensor of shape (batch, channels, time, height, Width) - - "audio" is a Tensor of shape (batch, channels, time, 1, frequency) - - "label" is a Tensor of shape (batch, 1) - - The PyTorchVideo models and transforms expect the same input shapes and - dictionary structure making this function just a matter of unwrapping the dict and - feeding it through the model/loss. 
- """ - x = batch[self.batch_key] - y_hat = self.model(x) - loss = F.cross_entropy(y_hat, batch["label"]) - acc = self.train_accuracy(F.softmax(y_hat, dim=-1), batch["label"]) - self.log("train_loss", loss) - self.log( - "train_acc", acc, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True - ) - return loss - - def validation_step(self, batch, batch_idx): - """ - This function is called in the inner loop of the evaluation cycle. For this - simple example it's mostly the same as the training loop but with a different - metric name. - """ - x = batch[self.batch_key] - y_hat = self.model(x) - loss = F.cross_entropy(y_hat, batch["label"]) - acc = self.val_accuracy(F.softmax(y_hat, dim=-1), batch["label"]) - self.log("val_loss", loss) - self.log( - "val_acc", acc, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True - ) - return loss - - def configure_optimizers(self): - """ - We use the SGD optimizer with per step cosine annealing scheduler. - """ - optimizer = torch.optim.SGD( - self.parameters(), - lr=self.args.lr, - momentum=self.args.momentum, - weight_decay=self.args.weight_decay, - ) - scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer, self.args.max_epochs, last_epoch=-1 - ) - return [optimizer], [scheduler] - - -class KineticsDataModule(pytorch_lightning.LightningDataModule): - """ - This LightningDataModule implementation constructs a PyTorchVideo Kinetics dataset for both - the train and val partitions. It defines each partition's augmentation and - preprocessing transforms and configures the PyTorch DataLoaders. - """ - - TRAIN_PATH = 'train.csv' - VAL_PATH = 'val.csv' - - def __init__(self, args): - self.args = args - super().__init__() - - def _make_transforms(self, mode: str): - """ - ################## - # PTV Transforms # - ################## - - # Each PyTorchVideo dataset has a "transform" arg. This arg takes a - # Callable[[Dict], Any], and is used on the output Dict of the dataset to - # define any application specific processing or augmentation. Transforms can - # either be implemented by the user application or reused from any library - # that's domain specific to the modality. E.g. for video we recommend using - # TorchVision, for audio we recommend TorchAudio. - # - # To improve interoperation between domain transform libraries, PyTorchVideo - # provides a dictionary transform API that provides: - # - ApplyTransformToKey(key, transform) - applies a transform to specific modality - # - RemoveKey(key) - remove a specific modality from the clip - # - # In the case that the recommended libraries don't provide transforms that - # are common enough for PyTorchVideo use cases, PyTorchVideo will provide them in - # the same structure as the recommended library. E.g. TorchVision didn't - # have a RandomShortSideScale video transform so it's been added to PyTorchVideo. - """ - if self.args.data_type == "video": - transform = [ - self._video_transform(mode), - RemoveKey("audio"), - ] - elif self.args.data_type == "audio": - transform = [ - self._audio_transform(), - RemoveKey("video"), - ] - else: - raise Exception(f"{self.args.data_type} not supported") - - return Compose(transform) - - def _video_transform(self, mode: str): - """ - This function contains example transforms using both PyTorchVideo and TorchVision - in the same Callable. For 'train' mode, we use augmentations (prepended with - 'Random'), for 'val' mode we use the respective determinstic function. 
- """ - args = self.args - return ApplyTransformToKey( - key="video", - transform=Compose( - [ - UniformTemporalSubsample(args.video_num_subsampled), - Lambda(lambda x: x/255.0), - Normalize(args.video_means, args.video_stds), - ] - + ( - [ - RandomShortSideScale( - min_size=args.video_min_short_side_scale, - max_size=args.video_max_short_side_scale, - ), - RandomCrop(args.video_crop_size), - RandomHorizontalFlip(p=args.video_horizontal_flip_p), - ] - if mode == "train" - else [ - ShortSideScale(args.video_min_short_side_scale), - CenterCrop(args.video_crop_size), - ] - ) - ), - ) - - def _audio_transform(self): - """ - This function contains example transforms using both PyTorchVideo and TorchAudio - in the same Callable. - """ - args = self.args - n_fft = int( - float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size - ) - hop_length = int( - float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size - ) - eps = 1e-10 - return ApplyTransformToKey( - key="audio", - transform=Compose( - [ - Resample( - orig_freq=args.audio_raw_sample_rate, - new_freq=args.audio_resampled_rate, - ), - MelSpectrogram( - sample_rate=args.audio_resampled_rate, - n_fft=n_fft, - hop_length=hop_length, - n_mels=args.audio_num_mels, - center=False, - ), - Lambda(lambda x: x.clamp(min=eps)), - Lambda(torch.log), - UniformTemporalSubsample(args.audio_mel_num_subsample), - Lambda(lambda x: x.transpose(1, 0)), # (F, T) -> (T, F) - Lambda( - lambda x: x.view(1, x.size(0), 1, x.size(1)) - ), # (T, F) -> (1, T, 1, F) - Normalize((args.audio_logmel_mean,), (args.audio_logmel_std,)), - ] - ), - ) - - def train_dataloader(self): - """ - Defines the train DataLoader that the PyTorch Lightning Trainer trains/tests with. - """ - sampler = DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler - train_transform = self._make_transforms(mode="train") - self.train_dataset = LimitDataset( - pytorchvideo.data.Kinetics( - data_path=os.path.join(self.args.data_path, self.TRAIN_PATH), - clip_sampler=pytorchvideo.data.make_clip_sampler( - "random", self.args.clip_duration - ), - video_path_prefix=self.args.video_path_prefix, - transform=train_transform, - video_sampler=sampler, - ) - ) - return torch.utils.data.DataLoader( - self.train_dataset, - batch_size=self.args.batch_size, - num_workers=self.args.workers, - ) - - def val_dataloader(self): - """ - Defines the train DataLoader that the PyTorch Lightning Trainer trains/tests with. - """ - sampler = DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) else RandomSampler - val_transform = self._make_transforms(mode="val") - self.val_dataset = LimitDataset( - pytorchvideo.data.Kinetics( - data_path=os.path.join(self.args.data_path, self.VAL_PATH), - clip_sampler=pytorchvideo.data.make_clip_sampler( - "uniform", self.args.clip_duration - ), - video_path_prefix=self.args.video_path_prefix, - transform=val_transform, - video_sampler=sampler, - ) - ) - return torch.utils.data.DataLoader( - self.val_dataset, - batch_size=self.args.batch_size, - num_workers=self.args.workers, - ) - - -class LimitDataset(torch.utils.data.Dataset): - """ - To ensure a constant number of samples are retrieved from the dataset we use this - LimitDataset wrapper. This is necessary because several of the underlying videos - may be corrupted while fetching or decoding, however, we always want the same - number of steps per epoch. 
- """ - - def __init__(self, dataset): - super().__init__() - self.dataset = dataset - self.dataset_iter = itertools.chain.from_iterable( - itertools.repeat(iter(dataset), 2) - ) - - def __getitem__(self, index): - return next(self.dataset_iter) - - def __len__(self): - return self.dataset.num_videos - - -def main(): - """ - To train the ResNet with the Kinetics dataset we construct the two modules above, - and pass them to the fit function of a pytorch_lightning.Trainer. - - This example can be run either locally (with default parameters) or on a Slurm - cluster. To run on a Slurm cluster provide the --on_cluster argument. - """ - setup_logger() - - pytorch_lightning.trainer.seed_everything() - parser = argparse.ArgumentParser() +def parse_args(args=None): + parser = ArgumentParser() # Cluster parameters. parser.add_argument("--on_cluster", action="store_true") @@ -384,54 +14,32 @@ def main(): parser.add_argument("--working_directory", default=".", type=str) parser.add_argument("--partition", default="dev", type=str) - # Model parameters. - parser.add_argument("--lr", "--learning-rate", default=0.1, type=float) - parser.add_argument("--momentum", default=0.9, type=float) - parser.add_argument("--weight_decay", default=1e-4, type=float) - parser.add_argument( - "--arch", - default="video_resnet", - choices=["video_resnet", "audio_resnet"], - type=str, - ) + # Model Parameters + parser.add_argument('--lr', '--learning_rate', default=2e-4, type=float) - # Data parameters. - parser.add_argument("--data_path", default=None, type=str, required=True) - parser.add_argument("--video_path_prefix", default="", type=str) - parser.add_argument("--workers", default=8, type=int) - parser.add_argument("--batch_size", default=32, type=int) - parser.add_argument("--clip_duration", default=2, type=float) - parser.add_argument( - "--data_type", default="video", choices=["video", "audio"], type=str - ) - parser.add_argument("--video_num_subsampled", default=8, type=int) - parser.add_argument("--video_means", default=(0.45, 0.45, 0.45), type=tuple) - parser.add_argument("--video_stds", default=(0.225, 0.225, 0.225), type=tuple) - parser.add_argument("--video_crop_size", default=224, type=int) - parser.add_argument("--video_min_short_side_scale", default=256, type=int) - parser.add_argument("--video_max_short_side_scale", default=320, type=int) - parser.add_argument("--video_horizontal_flip_p", default=0.5, type=float) - parser.add_argument("--audio_raw_sample_rate", default=44100, type=int) - parser.add_argument("--audio_resampled_rate", default=16000, type=int) - parser.add_argument("--audio_mel_window_size", default=32, type=int) - parser.add_argument("--audio_mel_step_size", default=16, type=int) - parser.add_argument("--audio_num_mels", default=80, type=int) - parser.add_argument("--audio_mel_num_subsample", default=128, type=int) - parser.add_argument("--audio_logmel_mean", default=-7.03, type=float) - parser.add_argument("--audio_logmel_std", default=4.66, type=float) + # Data Parameters + parser = LabeledVideoDataModule.add_argparse_args(parser) - # Trainer parameters. - parser = pytorch_lightning.Trainer.add_argparse_args(parser) + # Training Parameters + parser = pl.Trainer.add_argparse_args(parser) parser.set_defaults( - max_epochs=200, - callbacks=[LearningRateMonitor()], + callbacks=[pl.callbacks.LearningRateMonitor()], replace_sampler_ddp=False, - reload_dataloaders_every_epoch=False, ) - # Build trainer, ResNet lightning-module and Kinetics data-module. 
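The refactored parse_args() can also be driven without a shell by passing a list of flags. This sketch assumes the data-module flags (--root, --batch_size, --workers) are generated by LabeledVideoDataModule.add_argparse_args from its __init__ signature, so the exact names may differ:

    from train import parse_args

    args = parse_args(
        [
            "--lr", "2e-4",
            "--root", "./data",
            "--batch_size", "8",
            "--workers", "2",
            "--max_epochs", "1",
        ]
    )
    print(args.lr, args.batch_size, args.max_epochs)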
- args = parser.parse_args() + return parser.parse_args(args) + + +def train(args): + pl.seed_everything(224) + dm = LabeledVideoDataModule.from_argparse_args(args) + model = VideoClassificationLightningModule(num_classes=dm.NUM_CLASSES, **vars(args)) + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model, dm) + +def main(): + args = parse_args() if args.on_cluster: from slurm import copy_and_run_with_config copy_and_run_with_config( @@ -452,21 +60,5 @@ def main(): train(args) -def train(args): - trainer = pytorch_lightning.Trainer.from_argparse_args(args) - classification_module = VideoClassificationLightningModule(args) - data_module = KineticsDataModule(args) - trainer.fit(classification_module, data_module) - - -def setup_logger(): - ch = logging.StreamHandler() - formatter = logging.Formatter("\n%(asctime)s [%(levelname)s] %(name)s: %(message)s") - ch.setFormatter(formatter) - logger = logging.getLogger("pytorchvideo") - logger.setLevel(logging.DEBUG) - logger.addHandler(ch) - - -if __name__ == "__main__": +if __name__ == '__main__': main() From 7df5f3e321d8bb8b5694fa1e7048147be08811eb Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 11:42:40 -0600 Subject: [PATCH 09/15] :lipstick: apply style --- .../video_classification_example/data.py | 47 +++++++++++-------- .../video_classification_example/finetune.py | 4 +- .../video_classification_example/models.py | 17 +++---- .../video_classification_example/train.py | 5 +- 4 files changed, 38 insertions(+), 35 deletions(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index b1c29268..b2156f97 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -16,7 +16,6 @@ ShortSideScale, UniformTemporalSubsample, ) - from torch.utils.data import DataLoader, DistributedSampler, RandomSampler from torchvision.transforms import ( CenterCrop, @@ -36,7 +35,7 @@ class LabeledVideoDataModule(pl.LightningDataModule): def __init__( self, - root: str = './', + root: str = "./", clip_duration: int = 2, video_num_subsampled: int = 8, video_crop_size: int = 224, @@ -66,7 +65,7 @@ def __init__( # Transforms applied to train dataset self.train_transform = ApplyTransformToKey( - key='video', + key="video", transform=Compose( [ UniformTemporalSubsample(self.video_num_subsampled), @@ -79,21 +78,21 @@ def __init__( RandomCrop(self.video_crop_size), RandomHorizontalFlip(p=self.video_horizontal_flip_p), ] - ) + ), ) # Transforms applied on val dataset or for inference self.val_transform = ApplyTransformToKey( - key='video', + key="video", transform=Compose( [ UniformTemporalSubsample(self.video_num_subsampled), Lambda(lambda x: x / 255.0), Normalize(self.video_means, self.video_stds), ShortSideScale(self.video_min_short_side_scale), - CenterCrop(self.video_crop_size) + CenterCrop(self.video_crop_size), ] - ) + ), ) def prepare_data(self): @@ -105,30 +104,34 @@ def prepare_data(self): def train_dataloader(self): self.train_dataset = LimitDataset( labeled_video_dataset( - data_path=str(Path(self.data_path) / 'train'), + data_path=str(Path(self.data_path) / "train"), clip_sampler=make_clip_sampler("random", self.clip_duration), transform=self.train_transform, decode_audio=False, video_sampler=DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) - else RandomSampler + else RandomSampler, ) ) - return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.workers) + return DataLoader( + 
self.train_dataset, batch_size=self.batch_size, num_workers=self.workers + ) def val_dataloader(self): self.val_dataset = LimitDataset( labeled_video_dataset( - data_path=str(Path(self.data_path) / 'val'), + data_path=str(Path(self.data_path) / "val"), clip_sampler=make_clip_sampler("uniform", self.clip_duration), transform=self.val_transform, decode_audio=False, video_sampler=DistributedSampler if (self.trainer is not None and self.trainer.use_ddp) - else RandomSampler + else RandomSampler, ) ) - return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.workers) + return DataLoader( + self.val_dataset, batch_size=self.batch_size, num_workers=self.workers + ) class UCF11DataModule(LabeledVideoDataModule): @@ -201,7 +204,7 @@ def setup(self, stage=None): # Collect scene folders within this class # Ex. 'action_youtube_naudio/basketball/v_shooting_01' - for scene_folder in filter(Path.is_dir, class_folder.glob('v_*')): + for scene_folder in filter(Path.is_dir, class_folder.glob("v_*")): scene_folders.append(scene_folder) # Randomly shuffle the scene folders before splitting them into train/val @@ -236,25 +239,29 @@ def train_dataloader(self): self.train_dataset = LimitDataset( LabeledVideoDataset( self.train_paths, - clip_sampler=make_clip_sampler('random', self.clip_duration), + clip_sampler=make_clip_sampler("random", self.clip_duration), decode_audio=False, transform=self.train_transform, - video_sampler=RandomSampler + video_sampler=RandomSampler, ) ) - return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.workers) + return DataLoader( + self.train_dataset, batch_size=self.batch_size, num_workers=self.workers + ) def val_dataloader(self): self.val_dataset = LimitDataset( LabeledVideoDataset( self.val_paths, - clip_sampler=make_clip_sampler('uniform', self.clip_duration), + clip_sampler=make_clip_sampler("uniform", self.clip_duration), decode_audio=False, transform=self.val_transform, - video_sampler=RandomSampler + video_sampler=RandomSampler, ) ) - return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.workers) + return DataLoader( + self.val_dataset, batch_size=self.batch_size, num_workers=self.workers + ) def download_and_unzip(url, data_dir="./", verify=True): diff --git a/tutorials/video_classification_example/finetune.py b/tutorials/video_classification_example/finetune.py index db097110..0dd05734 100644 --- a/tutorials/video_classification_example/finetune.py +++ b/tutorials/video_classification_example/finetune.py @@ -1,5 +1,4 @@ import pytorch_lightning as pl - from data import UCF11DataModule from models import SlowResnet50LightningModel from train import parse_args @@ -17,6 +16,7 @@ def main(): args = parse_args() if args.on_cluster: from slurm import copy_and_run_with_config + copy_and_run_with_config( train, args, @@ -35,5 +35,5 @@ def main(): train(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tutorials/video_classification_example/models.py b/tutorials/video_classification_example/models.py index 9571e8d8..600f59ce 100644 --- a/tutorials/video_classification_example/models.py +++ b/tutorials/video_classification_example/models.py @@ -1,18 +1,12 @@ import pytorch_lightning as pl import torch -from torch import nn -from pytorchvideo.models.resnet import create_resnet from pytorchvideo.models.head import create_res_basic_head +from pytorchvideo.models.resnet import create_resnet +from torch import nn class VideoClassificationLightningModule(pl.LightningModule): - - 
def __init__( - self, - num_classes: int = 11, - lr: float = 2e-4, - **kwargs - ): + def __init__(self, num_classes: int = 11, lr: float = 2e-4, **kwargs): """A classifier for finetuning pretrained video classification backbones from torchhub. We use the slow_r50 model here, but you can edit this class to use whatever backbone/head you'd like. @@ -125,9 +119,10 @@ def configure_optimizers(self): class SlowResnet50LightningModel(VideoClassificationLightningModule): - def __init__(self, freeze_backbone: bool = True, pretrained: bool = True, **kwargs): - super().__init__(freeze_backbone=freeze_backbone, pretrained=pretrained, **kwargs) + super().__init__( + freeze_backbone=freeze_backbone, pretrained=pretrained, **kwargs + ) def _build_model(self): # The pretrained resnet model - we strip off its head to get the backbone diff --git a/tutorials/video_classification_example/train.py b/tutorials/video_classification_example/train.py index 93ec1eb2..0d7b5ebb 100644 --- a/tutorials/video_classification_example/train.py +++ b/tutorials/video_classification_example/train.py @@ -15,7 +15,7 @@ def parse_args(args=None): parser.add_argument("--partition", default="dev", type=str) # Model Parameters - parser.add_argument('--lr', '--learning_rate', default=2e-4, type=float) + parser.add_argument("--lr", "--learning_rate", default=2e-4, type=float) # Data Parameters parser = LabeledVideoDataModule.add_argparse_args(parser) @@ -42,6 +42,7 @@ def main(): args = parse_args() if args.on_cluster: from slurm import copy_and_run_with_config + copy_and_run_with_config( train, args, @@ -60,5 +61,5 @@ def main(): train(args) -if __name__ == '__main__': +if __name__ == "__main__": main() From 9dba4ad34f2e2887bb0a4c777726ec53de64c35e Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 12:01:52 -0600 Subject: [PATCH 10/15] :art: move sampler statement to its own line --- tutorials/video_classification_example/data.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index b2156f97..2d8890c5 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -102,15 +102,14 @@ def prepare_data(self): download_and_unzip(self.SOURCE_URL, self.root, verify=self.VERIFY_SSL) def train_dataloader(self): + do_use_ddp = self.trainer is not None and self.trainer.use_ddp self.train_dataset = LimitDataset( labeled_video_dataset( data_path=str(Path(self.data_path) / "train"), clip_sampler=make_clip_sampler("random", self.clip_duration), transform=self.train_transform, decode_audio=False, - video_sampler=DistributedSampler - if (self.trainer is not None and self.trainer.use_ddp) - else RandomSampler, + video_sampler=DistributedSampler if do_use_ddp else RandomSampler, ) ) return DataLoader( @@ -118,15 +117,14 @@ def train_dataloader(self): ) def val_dataloader(self): + do_use_ddp = self.trainer is not None and self.trainer.use_ddp self.val_dataset = LimitDataset( labeled_video_dataset( data_path=str(Path(self.data_path) / "val"), clip_sampler=make_clip_sampler("uniform", self.clip_duration), transform=self.val_transform, decode_audio=False, - video_sampler=DistributedSampler - if (self.trainer is not None and self.trainer.use_ddp) - else RandomSampler, + video_sampler=DistributedSampler if do_use_ddp else RandomSampler, ) ) return DataLoader( @@ -180,7 +178,7 @@ def __init__(self, **kwargs): """ super().__init__(**kwargs) - def setup(self, stage=None): 
+ def setup(self, stage: str = None): """Set up anything needed for initializing train/val datasets. This runs on all nodes""" # Names of classes to predict From 0886737ad51aaa1d3081b9e9041b5e2ab8d1fce0 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 12:16:23 -0600 Subject: [PATCH 11/15] :pencil: writing docs --- .../video_classification_example/data.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index 2d8890c5..ca8b3bd0 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -48,6 +48,43 @@ def __init__( workers: int = 4, **kwargs ): + """ + A LabeledVideoDataModule expects a dataset in the following format: + + /root # Root Folder + ├── train # Split Folder + │ ├── archery # Class Folder + │ │ ├── -1q7jA3DXQM_000005_000015.mp4 # Videos + │ │ ├── -5NN5hdIwTc_000036_000046.mp4 + │ │ ... + │ ├── bowling + │ │ ├── -5ExwuF5IUI_000030_000040.mp4 + │ │ ... + │ ├── high_jump + │ │ ├── -5ExwuF5IUI_000030_000040.mp4 + │ │ ... + ├── val + │ ├── archery + │ │ ├── -1q7jA3DXQM_000005_000015.mp4 + │ │ ├── -5NN5hdIwTc_000036_000046.mp4 + │ │ ... + │ ├── bowling + │ │ ├── -5ExwuF5IUI_000030_000040.mp4 + │ │ ... + + Args: + root (str, optional): Directory where your dataset is stored. Defaults to "./". + clip_duration (int, optional): Duration of clip samples. Defaults to 2. + video_num_subsampled (int, optional): Number of subsamples to take of individual videos. Defaults to 8. + video_crop_size (int, optional): Size to crop the video to. Defaults to 224. + video_means (Tuple[float], optional): Means used to normalize dataset. Defaults to (0.45, 0.45, 0.45). + video_stds (Tuple[float], optional): Standard deviations used to normalized dataset. Defaults to (0.225, 0.225, 0.225). + video_min_short_side_scale (int, optional): min_size arg passed to pytorchvideo.transforms.RandomShortSideScale. Defaults to 256. + video_max_short_side_scale (int, optional): max_size arg passed to pytorchvideo.transforms.RandomShortSideScale. Defaults to 320. + video_horizontal_flip_p (float, optional): Probability of flipping a training example horizontally. Defaults to 0.5. + batch_size (int, optional): Number of examples per batch. Defaults to 4. + workers (int, optional): Number of DataLoader workers. Defaults to 4. 
+ """ super().__init__() self.root = root From 8a9f527e743ef58a5a6d407fcfaf44ebda934bf7 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 12:35:24 -0600 Subject: [PATCH 12/15] :pencil: update docstring with more specific path --- tutorials/video_classification_example/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index ca8b3bd0..22d2b91b 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -188,7 +188,7 @@ def __init__(self, **kwargs): The folder structure looks like the following: - /data_dir + /root/action_youtube_naudio ├── basketball # Class Folder Path │ ├── v_shooting_01 # Scene/Group Folder Path │ │ ├── v_shooting_01_01.avi # Video Path From 608c16be705243d135978277a3a3b7490beb1d99 Mon Sep 17 00:00:00 2001 From: nateraw Date: Thu, 20 May 2021 12:44:22 -0600 Subject: [PATCH 13/15] :pencil: add periods to keep it consistent --- .../video_classification_example/data.py | 22 +++++++++---------- .../video_classification_example/models.py | 20 ++++++++--------- .../video_classification_example/train.py | 6 ++--- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/tutorials/video_classification_example/data.py b/tutorials/video_classification_example/data.py index 22d2b91b..49972d9b 100644 --- a/tutorials/video_classification_example/data.py +++ b/tutorials/video_classification_example/data.py @@ -100,7 +100,7 @@ def __init__( self.batch_size = batch_size self.workers = workers - # Transforms applied to train dataset + # Transforms applied to train dataset. self.train_transform = ApplyTransformToKey( key="video", transform=Compose( @@ -118,7 +118,7 @@ def __init__( ), ) - # Transforms applied on val dataset or for inference + # Transforms applied on val dataset or for inference. self.val_transform = ApplyTransformToKey( key="video", transform=Compose( @@ -216,9 +216,9 @@ def __init__(self, **kwargs): super().__init__(**kwargs) def setup(self, stage: str = None): - """Set up anything needed for initializing train/val datasets. This runs on all nodes""" + """Set up anything needed for initializing train/val datasets. This runs on all nodes.""" - # Names of classes to predict + # Names of classes to predict. # Ex. ['basketball', 'biking', 'diving', ...] self.classes = sorted(x.name for x in self.data_path.glob("*") if x.is_dir()) @@ -226,23 +226,23 @@ def setup(self, stage: str = None): # Ex. {'basketball': 0, 'biking': 1, 'diving': 2, ...} self.label_to_id = {} - # A list to hold all available scenes across all classes + # A list to hold all available scenes across all classes. scene_folders = [] for class_id, class_name in enumerate(self.classes): self.label_to_id[class_name] = class_id - # The path of a class folder within self.data_path + # The path of a class folder within self.data_path. # Ex. 'action_youtube_naudio/{basketball|biking|diving|...}' class_folder = self.data_path / class_name - # Collect scene folders within this class + # Collect scene folders within this class. # Ex. 'action_youtube_naudio/basketball/v_shooting_01' for scene_folder in filter(Path.is_dir, class_folder.glob("v_*")): scene_folders.append(scene_folder) - # Randomly shuffle the scene folders before splitting them into train/val + # Randomly shuffle the scene folders before splitting them into train/val. shuffle(scene_folders) # Determine number of scenes in train/validation splits. 
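Given the train/val layout documented above, a minimal usage sketch of the data module together with the Lightning classifier looks like the following. The dataset root and the class count of 6 are placeholders, and the tutorial modules (data.py, models.py) are assumed to be importable:

    import pytorch_lightning as pl
    from data import LabeledVideoDataModule
    from models import VideoClassificationLightningModule

    dm = LabeledVideoDataModule(
        root="/path/to/dataset", clip_duration=2, batch_size=4, workers=2
    )
    model = VideoClassificationLightningModule(num_classes=6, lr=2e-4)

    # Adjust gpus/max_epochs to your hardware and budget.
    trainer = pl.Trainer(gpus=1, max_epochs=5, replace_sampler_ddp=False)
    trainer.fit(model, dm)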
@@ -250,18 +250,18 @@ def setup(self, stage: str = None): self.num_val_scenes = len(scene_folders) - self.num_train_scenes # Collect train/val paths to videos within each scene folder. - # Validation only uses videos from scenes not seen by model during training + # Validation only uses videos from scenes not seen by model during training. self.train_paths = [] self.val_paths = [] for i, scene_path in enumerate(scene_folders): - # The actual name of the class (Ex. 'basketball') + # The actual name of the class (Ex. 'basketball'). class_name = scene_path.parent.name # Loop over all the videos within the given scene folder. for video_path in scene_path.glob("*.avi"): - # Construct a tuple containing (, ) + # Construct a tuple containing (, ). # In our case, we assign the class's ID as 'label'. labeled_path = (video_path, {"label": self.label_to_id[class_name]}) diff --git a/tutorials/video_classification_example/models.py b/tutorials/video_classification_example/models.py index 600f59ce..52233bd1 100644 --- a/tutorials/video_classification_example/models.py +++ b/tutorials/video_classification_example/models.py @@ -26,10 +26,10 @@ def __init__(self, num_classes: int = 11, lr: float = 2e-4, **kwargs): # Saves all kwargs to self.hparams. Use references to self.hparams., not the init args themselves. self.save_hyperparameters() - # Build the model in separate function so its easier to override + # Build the model in separate function so its easier to override. self.model = self._build_model() - # Metrics we will keep track of + # Metrics we will keep track of. self.loss_fn = nn.CrossEntropyLoss() self.train_acc = pl.metrics.Accuracy() self.val_acc = pl.metrics.Accuracy() @@ -80,20 +80,20 @@ def shared_step(self, batch, mode: str): torch.Tensor: The loss for a single batch step. """ - # Pass video tensor through model to get outputs + # Pass video tensor through model to get outputs. outputs = self(batch["video"]) - # Compute and log the cross entropy loss to {train|val}_loss in TensorBoard + # Compute and log the cross entropy loss to {train|val}_loss in TensorBoard. loss = self.loss_fn(outputs, batch["label"]) self.log(f"{mode}_loss", loss) - # Predicted class probabilities - (BATCH_SIZE, NUM_CLASSES) + # Predicted class probabilities - (BATCH_SIZE, NUM_CLASSES). proba = outputs.softmax(dim=1) - # Predicted classes - (BATCH_SIZE,) + # Predicted classes - (BATCH_SIZE,). preds = proba.argmax(dim=1) - # Compute the predicted class accuracy and log it to {train|val}_acc in TensorBoard + # Compute the predicted class accuracy and log it to {train|val}_acc in TensorBoard. acc = self.accuracy[mode](preds, batch["label"]) self.log(f"{mode}_acc", acc, prog_bar=True) @@ -125,7 +125,7 @@ def __init__(self, freeze_backbone: bool = True, pretrained: bool = True, **kwar ) def _build_model(self): - # The pretrained resnet model - we strip off its head to get the backbone + # The pretrained resnet model - we strip off its head to get the backbone. resnet = torch.hub.load( "facebookresearch/pytorchvideo", "slow_r50", @@ -133,12 +133,12 @@ def _build_model(self): ) self.backbone = nn.Sequential(*list(resnet.children())[0][:-1]) - # Freeze the backbone layers if specified + # Freeze the backbone layers if specified. if self.hparams.freeze_backbone: for param in self.backbone.parameters(): param.requires_grad = False - # Create a new head we will train on top of the backbone + # Create a new head we will train on top of the backbone. 
self.head = create_res_basic_head( in_features=2048, out_features=self.hparams.num_classes ) diff --git a/tutorials/video_classification_example/train.py b/tutorials/video_classification_example/train.py index 0d7b5ebb..8568b9c2 100644 --- a/tutorials/video_classification_example/train.py +++ b/tutorials/video_classification_example/train.py @@ -14,13 +14,13 @@ def parse_args(args=None): parser.add_argument("--working_directory", default=".", type=str) parser.add_argument("--partition", default="dev", type=str) - # Model Parameters + # Model Parameters. parser.add_argument("--lr", "--learning_rate", default=2e-4, type=float) - # Data Parameters + # Data Parameters. parser = LabeledVideoDataModule.add_argparse_args(parser) - # Training Parameters + # Training Parameters. parser = pl.Trainer.add_argparse_args(parser) parser.set_defaults( callbacks=[pl.callbacks.LearningRateMonitor()], From cde0ce75db03f1bbe7c52878ef5035151e7e1d11 Mon Sep 17 00:00:00 2001 From: nateraw Date: Sun, 23 May 2021 18:44:35 -0600 Subject: [PATCH 14/15] :fire: remove inline comments --- tutorials/video_classification_example/models.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tutorials/video_classification_example/models.py b/tutorials/video_classification_example/models.py index 52233bd1..2ec97fa6 100644 --- a/tutorials/video_classification_example/models.py +++ b/tutorials/video_classification_example/models.py @@ -80,20 +80,14 @@ def shared_step(self, batch, mode: str): torch.Tensor: The loss for a single batch step. """ - # Pass video tensor through model to get outputs. outputs = self(batch["video"]) - # Compute and log the cross entropy loss to {train|val}_loss in TensorBoard. loss = self.loss_fn(outputs, batch["label"]) self.log(f"{mode}_loss", loss) - # Predicted class probabilities - (BATCH_SIZE, NUM_CLASSES). proba = outputs.softmax(dim=1) - - # Predicted classes - (BATCH_SIZE,). preds = proba.argmax(dim=1) - # Compute the predicted class accuracy and log it to {train|val}_acc in TensorBoard. acc = self.accuracy[mode](preds, batch["label"]) self.log(f"{mode}_acc", acc, prog_bar=True) From 6608f9ace2145658ab4f1452bfd2832c5f2d5d6a Mon Sep 17 00:00:00 2001 From: nateraw Date: Sun, 23 May 2021 18:48:00 -0600 Subject: [PATCH 15/15] :fire: removing incomplete finetuning tutorial for now --- website/docs/tutorial_finetuning.md | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 website/docs/tutorial_finetuning.md diff --git a/website/docs/tutorial_finetuning.md b/website/docs/tutorial_finetuning.md deleted file mode 100644 index ba7d1763..00000000 --- a/website/docs/tutorial_finetuning.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -id: tutorial_finetuning -title: Finetune a TorchHub model for Classification ---- - -# Introduction - -In this tutorial, you will learn how to finetune a pre-trained [Slow Resnet50 model from TorchHub](https://pytorch.org/hub/facebookresearch_pytorchvideo_resnet/) on the [UCF11 Dataset](https://www.crcv.ucf.edu/data/UCF_YouTube_Action.php). -
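Finally, the pieces added across this series can be combined without the CLI. A hedged end-to-end sketch for UCF11 finetuning; it downloads the UCF11 archive and the slow_r50 weights on first run (so it needs network access), and the hyperparameters are illustrative only:

    import pytorch_lightning as pl
    from data import UCF11DataModule
    from models import SlowResnet50LightningModel

    pl.seed_everything(224)

    dm = UCF11DataModule(root="./", clip_duration=2, batch_size=8, workers=4)
    model = SlowResnet50LightningModel(
        num_classes=UCF11DataModule.NUM_CLASSES,
        lr=2e-4,
        freeze_backbone=True,
        pretrained=True,
    )

    # Adjust gpus/max_epochs to your hardware and budget.
    trainer = pl.Trainer(gpus=1, max_epochs=10, replace_sampler_ddp=False)
    trainer.fit(model, dm)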