From 4feccb607d7a16933d485495f91d067f177dd8db Mon Sep 17 00:00:00 2001
From: facebook-github-bot
Date: Tue, 13 Apr 2021 11:02:30 -0700
Subject: [PATCH] Initial commit
fbshipit-source-id: f0a1066f07cf2ccd46c3e06003937406dc6fded4
---
.circleci/config.yml | 205 +++
.flake8 | 6 +
.github/CODE_OF_CONDUCT.md | 76 +
.github/CONTRIBUTING.md | 55 +
.github/ISSUE_TEMPLATE/bugs.md | 30 +
.github/ISSUE_TEMPLATE/config.yml | 1 +
.github/ISSUE_TEMPLATE/feature_request.md | 21 +
.github/ISSUE_TEMPLATE/questions-help.md | 21 +
.github/PULL_REQUEST_TEMPLATE.md | 30 +
.github/logo_horizontal_color.png | Bin 0 -> 8071 bytes
.github/logo_horizontal_color.svg | 1 +
.gitignore | 34 +
.readthedocs.yml | 25 +
CONTRIBUTING.md | 41 +
INSTALL.md | 68 +
LICENSE | 201 +++
MANIFEST.in | 3 +
README.md | 79 +
dev/README.md | 11 +
dev/linter.sh | 32 +
docs/Makefile | 20 +
docs/README.md | 65 +
docs/make.bat | 35 +
docs/requirements.txt | 16 +
.../source/_static/css/pytorchvideo_theme.css | 134 ++
docs/source/_static/img/ptv_logo.png | Bin 0 -> 10053 bytes
docs/source/_static/img/ptv_logo.svg | 1 +
docs/source/_templates/layout.html | 25 +
docs/source/accelerator.md | 53 +
docs/source/api/data/charades.rst | 10 +
docs/source/api/data/domsev.rst | 9 +
docs/source/api/data/encoded_video.rst | 59 +
docs/source/api/data/epic_kitchen.rst | 18 +
docs/source/api/data/extra.rst | 48 +
docs/source/api/data/hmdb51.rst | 9 +
docs/source/api/data/index.rst | 11 +
docs/source/api/index.rst | 9 +
docs/source/api/layers/index.rst | 6 +
docs/source/api/layers/layers.rst | 72 +
docs/source/api/models/byol.rst | 8 +
docs/source/api/models/csn.rst | 8 +
docs/source/api/models/head.rst | 8 +
docs/source/api/models/index.rst | 17 +
docs/source/api/models/masked_multistream.rst | 8 +
docs/source/api/models/memory_bank.rst | 8 +
docs/source/api/models/net.rst | 8 +
docs/source/api/models/r2plus1d.rst | 8 +
docs/source/api/models/resnet.rst | 9 +
docs/source/api/models/simclr.rst | 8 +
docs/source/api/models/slowfast.rst | 8 +
docs/source/api/models/stem.rst | 8 +
docs/source/api/models/x3d.rst | 8 +
docs/source/api/transforms/index.rst | 6 +
docs/source/api/transforms/transforms.rst | 18 +
docs/source/conf.py | 197 +++
docs/source/data.md | 48 +
docs/source/data_preparation.md | 45 +
docs/source/index.rst | 47 +
docs/source/model_zoo.md | 63 +
docs/source/models.md | 180 +++
docs/source/transforms.md | 33 +
hubconf.py | 13 +
pytorchvideo/__init__.py | 3 +
pytorchvideo/accelerator/__init__.py | 1 +
.../accelerator/deployment/__init__.py | 1 +
.../accelerator/deployment/common/__init__.py | 1 +
.../deployment/common/model_transmuter.py | 86 +
.../deployment/mobile_cpu/__init__.py | 1 +
.../mobile_cpu/transmuter/__init__.py | 10 +
.../transmuter/transmuter_mobile_cpu.py | 204 +++
.../deployment/mobile_cpu/utils/__init__.py | 1 +
.../mobile_cpu/utils/model_conversion.py | 102 ++
.../accelerator/efficient_blocks/__init__.py | 1 +
.../efficient_blocks/efficient_block_base.py | 35 +
.../efficient_blocks/no_op_convert_block.py | 26 +
pytorchvideo/data/__init__.py | 12 +
pytorchvideo/data/charades.py | 225 +++
pytorchvideo/data/clip_sampling.py | 171 ++
pytorchvideo/data/dataset_manifest_utils.py | 266 +++
pytorchvideo/data/decoder.py | 7 +
pytorchvideo/data/domsev.py | 321 ++++
pytorchvideo/data/encoded_video.py | 123 ++
pytorchvideo/data/encoded_video_dataset.py | 279 ++++
pytorchvideo/data/encoded_video_pyav.py | 286 ++++
.../data/encoded_video_torchvision.py | 255 +++
pytorchvideo/data/epic_kitchen/__init__.py | 3 +
.../data/epic_kitchen/epic_kitchen_dataset.py | 195 +++
pytorchvideo/data/epic_kitchen/utils.py | 197 +++
pytorchvideo/data/epic_kitchen_forecasting.py | 295 ++++
pytorchvideo/data/epic_kitchen_recognition.py | 212 +++
pytorchvideo/data/frame_video.py | 210 +++
pytorchvideo/data/hmdb51.py | 230 +++
pytorchvideo/data/kinetics.py | 10 +
pytorchvideo/data/labeled_video_paths.py | 139 ++
pytorchvideo/data/ssv2.py | 254 +++
pytorchvideo/data/ucf101.py | 10 +
pytorchvideo/data/utils.py | 278 ++++
pytorchvideo/data/video.py | 72 +
pytorchvideo/layers/__init__.py | 5 +
pytorchvideo/layers/accelerator/__init__.py | 1 +
.../layers/accelerator/mobile_cpu/__init__.py | 1 +
.../mobile_cpu/activation_functions.py | 103 ++
.../accelerator/mobile_cpu/attention.py | 109 ++
.../accelerator/mobile_cpu/conv_helper.py | 556 +++++++
.../accelerator/mobile_cpu/convolutions.py | 592 +++++++
.../accelerator/mobile_cpu/fully_connected.py | 26 +
.../layers/accelerator/mobile_cpu/pool.py | 113 ++
pytorchvideo/layers/batch_norm.py | 120 ++
pytorchvideo/layers/convolutions.py | 237 +++
pytorchvideo/layers/distributed.py | 15 +
pytorchvideo/layers/fusion.py | 149 ++
pytorchvideo/layers/mlp.py | 62 +
pytorchvideo/layers/nonlocal_net.py | 153 ++
pytorchvideo/layers/positional_encoding.py | 42 +
pytorchvideo/layers/squeeze_excitation.py | 182 +++
pytorchvideo/layers/swish.py | 34 +
pytorchvideo/layers/utils.py | 49 +
pytorchvideo/models/__init__.py | 18 +
pytorchvideo/models/accelerator/__init__.py | 1 +
.../models/accelerator/mobile_cpu/__init__.py | 1 +
.../accelerator/mobile_cpu/efficient_x3d.py | 195 +++
.../accelerator/mobile_cpu/residual_blocks.py | 214 +++
pytorchvideo/models/byol.py | 140 ++
pytorchvideo/models/csn.py | 187 +++
pytorchvideo/models/head.py | 164 ++
pytorchvideo/models/hub/__init__.py | 6 +
.../models/hub/efficient_x3d_mobile_cpu.py | 80 +
pytorchvideo/models/hub/resnet.py | 52 +
pytorchvideo/models/hub/slowfast.py | 92 ++
pytorchvideo/models/hub/x3d.py | 125 ++
pytorchvideo/models/masked_multistream.py | 384 +++++
pytorchvideo/models/memory_bank.py | 113 ++
pytorchvideo/models/net.py | 92 ++
pytorchvideo/models/r2plus1d.py | 309 ++++
pytorchvideo/models/resnet.py | 1383 ++++++++++++++++
pytorchvideo/models/simclr.py | 63 +
pytorchvideo/models/slowfast.py | 487 ++++++
pytorchvideo/models/stem.py | 260 +++
pytorchvideo/models/weight_init.py | 42 +
pytorchvideo/models/x3d.py | 800 +++++++++
pytorchvideo/transforms/__init__.py | 3 +
pytorchvideo/transforms/functional.py | 212 +++
pytorchvideo/transforms/transforms.py | 104 ++
setup.cfg | 8 +
setup.py | 79 +
tests/README.md | 21 +
tests/__init__.py | 1 +
.../benchmark_accelerator_efficient_blocks.py | 355 ++++
tests/benchmark_transforms.py | 82 +
..._deployment_mobile_cpu_model_conversion.py | 83 +
...accelerator_deployment_model_transmuter.py | 87 +
..._blocks_mobile_cpu_activation_attention.py | 55 +
...ator_efficient_blocks_mobile_cpu_conv3d.py | 144 ++
..._efficient_blocks_mobile_cpu_head_layer.py | 81 +
...icient_blocks_mobile_cpu_residual_block.py | 56 +
.../test_accelerator_models_efficient_x3d.py | 102 ++
tests/test_data_charades_dataset.py | 114 ++
tests/test_data_dataset_manifest_utils.py | 148 ++
tests/test_data_domsev_dataset.py | 242 +++
tests/test_data_encoded_video.py | 140 ++
tests/test_data_encoded_video_dataset.py | 686 ++++++++
tests/test_data_epic_kitchen_dataset.py | 258 +++
tests/test_data_epic_kitchen_forecasting.py | 424 +++++
tests/test_data_epic_kitchen_recognition.py | 166 ++
tests/test_data_epic_kitchen_utils.py | 190 +++
tests/test_data_frame_video.py | 50 +
tests/test_data_ssv2_dataset.py | 96 ++
tests/test_data_utils.py | 126 ++
tests/test_layers_convolutions.py | 219 +++
tests/test_layers_fusion.py | 64 +
tests/test_layers_mlp.py | 36 +
tests/test_layers_nonlocal_net.py | 159 ++
tests/test_layers_positional_encoding.py | 69 +
tests/test_layers_squeeze_excitation.py | 51 +
tests/test_models_byol.py | 36 +
tests/test_models_csn.py | 96 ++
tests/test_models_head.py | 171 ++
tests/test_models_masked_multistream.py | 130 ++
tests/test_models_memory_bank.py | 37 +
tests/test_models_r2plus1d.py | 102 ++
tests/test_models_resnet.py | 1440 +++++++++++++++++
tests/test_models_slowfast.py | 99 ++
tests/test_models_stem.py | 303 ++++
tests/test_models_x3d.py | 135 ++
tests/test_simclr.py | 38 +
tests/test_transforms.py | 196 +++
tests/utils.py | 217 +++
..._model_with_PytorchVideo_Accelerator.ipynb | 464 ++++++
.../accelerator/Use_Model_Transmuter.ipynb | 279 ++++
...e_PytorchVideo_Accelerator_Model_Zoo.ipynb | 345 ++++
tutorials/torchhub_inference_tutorial.ipynb | 264 +++
website/.dockerignore | 2 +
website/.gitignore | 12 +
.../tutorial_accelerator_build_your_model.md | 439 +++++
...l_accelerator_use_accelerator_model_zoo.md | 118 ++
...torial_accelerator_use_model_transmuter.md | 98 ++
website/docs/tutorial_classification.md | 221 +++
website/docs/tutorial_overview.md | 10 +
website/docs/tutorial_torchhub_inference.md | 164 ++
website/website/README.md | 216 +++
website/website/core/Footer.js | 91 ++
website/website/package.json | 14 +
website/website/pages/en/index.js | 237 +++
website/website/sidebars.json | 7 +
website/website/siteConfig.js | 63 +
website/website/static/css/custom.css | 348 ++++
website/website/static/img/efficient.svg | 1 +
website/website/static/img/favicon.png | Bin 0 -> 664 bytes
website/website/static/img/logo.svg | 1 +
website/website/static/img/logo_no_text.svg | 1 +
website/website/static/img/logo_white.svg | 1 +
website/website/static/img/modelzoo.svg | 1 +
website/website/static/img/oss_logo.png | Bin 0 -> 4370 bytes
website/website/static/img/pytorch.svg | 12 +
website/website/static/img/reproducible.svg | 1 +
215 files changed, 25704 insertions(+)
create mode 100644 .circleci/config.yml
create mode 100644 .flake8
create mode 100644 .github/CODE_OF_CONDUCT.md
create mode 100644 .github/CONTRIBUTING.md
create mode 100644 .github/ISSUE_TEMPLATE/bugs.md
create mode 100644 .github/ISSUE_TEMPLATE/config.yml
create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md
create mode 100644 .github/ISSUE_TEMPLATE/questions-help.md
create mode 100644 .github/PULL_REQUEST_TEMPLATE.md
create mode 100644 .github/logo_horizontal_color.png
create mode 100644 .github/logo_horizontal_color.svg
create mode 100644 .gitignore
create mode 100644 .readthedocs.yml
create mode 100644 CONTRIBUTING.md
create mode 100644 INSTALL.md
create mode 100644 LICENSE
create mode 100644 MANIFEST.in
create mode 100644 README.md
create mode 100644 dev/README.md
create mode 100755 dev/linter.sh
create mode 100644 docs/Makefile
create mode 100644 docs/README.md
create mode 100644 docs/make.bat
create mode 100644 docs/requirements.txt
create mode 100644 docs/source/_static/css/pytorchvideo_theme.css
create mode 100644 docs/source/_static/img/ptv_logo.png
create mode 100644 docs/source/_static/img/ptv_logo.svg
create mode 100644 docs/source/_templates/layout.html
create mode 100644 docs/source/accelerator.md
create mode 100644 docs/source/api/data/charades.rst
create mode 100644 docs/source/api/data/domsev.rst
create mode 100644 docs/source/api/data/encoded_video.rst
create mode 100644 docs/source/api/data/epic_kitchen.rst
create mode 100644 docs/source/api/data/extra.rst
create mode 100644 docs/source/api/data/hmdb51.rst
create mode 100644 docs/source/api/data/index.rst
create mode 100644 docs/source/api/index.rst
create mode 100644 docs/source/api/layers/index.rst
create mode 100644 docs/source/api/layers/layers.rst
create mode 100644 docs/source/api/models/byol.rst
create mode 100644 docs/source/api/models/csn.rst
create mode 100644 docs/source/api/models/head.rst
create mode 100644 docs/source/api/models/index.rst
create mode 100644 docs/source/api/models/masked_multistream.rst
create mode 100644 docs/source/api/models/memory_bank.rst
create mode 100644 docs/source/api/models/net.rst
create mode 100644 docs/source/api/models/r2plus1d.rst
create mode 100644 docs/source/api/models/resnet.rst
create mode 100644 docs/source/api/models/simclr.rst
create mode 100644 docs/source/api/models/slowfast.rst
create mode 100644 docs/source/api/models/stem.rst
create mode 100644 docs/source/api/models/x3d.rst
create mode 100644 docs/source/api/transforms/index.rst
create mode 100644 docs/source/api/transforms/transforms.rst
create mode 100644 docs/source/conf.py
create mode 100644 docs/source/data.md
create mode 100644 docs/source/data_preparation.md
create mode 100644 docs/source/index.rst
create mode 100644 docs/source/model_zoo.md
create mode 100644 docs/source/models.md
create mode 100644 docs/source/transforms.md
create mode 100644 hubconf.py
create mode 100644 pytorchvideo/__init__.py
create mode 100644 pytorchvideo/accelerator/__init__.py
create mode 100644 pytorchvideo/accelerator/deployment/__init__.py
create mode 100644 pytorchvideo/accelerator/deployment/common/__init__.py
create mode 100644 pytorchvideo/accelerator/deployment/common/model_transmuter.py
create mode 100644 pytorchvideo/accelerator/deployment/mobile_cpu/__init__.py
create mode 100644 pytorchvideo/accelerator/deployment/mobile_cpu/transmuter/__init__.py
create mode 100644 pytorchvideo/accelerator/deployment/mobile_cpu/transmuter/transmuter_mobile_cpu.py
create mode 100644 pytorchvideo/accelerator/deployment/mobile_cpu/utils/__init__.py
create mode 100644 pytorchvideo/accelerator/deployment/mobile_cpu/utils/model_conversion.py
create mode 100644 pytorchvideo/accelerator/efficient_blocks/__init__.py
create mode 100644 pytorchvideo/accelerator/efficient_blocks/efficient_block_base.py
create mode 100644 pytorchvideo/accelerator/efficient_blocks/no_op_convert_block.py
create mode 100644 pytorchvideo/data/__init__.py
create mode 100644 pytorchvideo/data/charades.py
create mode 100644 pytorchvideo/data/clip_sampling.py
create mode 100644 pytorchvideo/data/dataset_manifest_utils.py
create mode 100644 pytorchvideo/data/decoder.py
create mode 100644 pytorchvideo/data/domsev.py
create mode 100644 pytorchvideo/data/encoded_video.py
create mode 100644 pytorchvideo/data/encoded_video_dataset.py
create mode 100644 pytorchvideo/data/encoded_video_pyav.py
create mode 100644 pytorchvideo/data/encoded_video_torchvision.py
create mode 100644 pytorchvideo/data/epic_kitchen/__init__.py
create mode 100644 pytorchvideo/data/epic_kitchen/epic_kitchen_dataset.py
create mode 100644 pytorchvideo/data/epic_kitchen/utils.py
create mode 100644 pytorchvideo/data/epic_kitchen_forecasting.py
create mode 100644 pytorchvideo/data/epic_kitchen_recognition.py
create mode 100644 pytorchvideo/data/frame_video.py
create mode 100644 pytorchvideo/data/hmdb51.py
create mode 100644 pytorchvideo/data/kinetics.py
create mode 100644 pytorchvideo/data/labeled_video_paths.py
create mode 100644 pytorchvideo/data/ssv2.py
create mode 100644 pytorchvideo/data/ucf101.py
create mode 100644 pytorchvideo/data/utils.py
create mode 100644 pytorchvideo/data/video.py
create mode 100644 pytorchvideo/layers/__init__.py
create mode 100644 pytorchvideo/layers/accelerator/__init__.py
create mode 100644 pytorchvideo/layers/accelerator/mobile_cpu/__init__.py
create mode 100644 pytorchvideo/layers/accelerator/mobile_cpu/activation_functions.py
create mode 100644 pytorchvideo/layers/accelerator/mobile_cpu/attention.py
create mode 100644 pytorchvideo/layers/accelerator/mobile_cpu/conv_helper.py
create mode 100644 pytorchvideo/layers/accelerator/mobile_cpu/convolutions.py
create mode 100644 pytorchvideo/layers/accelerator/mobile_cpu/fully_connected.py
create mode 100644 pytorchvideo/layers/accelerator/mobile_cpu/pool.py
create mode 100644 pytorchvideo/layers/batch_norm.py
create mode 100644 pytorchvideo/layers/convolutions.py
create mode 100644 pytorchvideo/layers/distributed.py
create mode 100644 pytorchvideo/layers/fusion.py
create mode 100644 pytorchvideo/layers/mlp.py
create mode 100644 pytorchvideo/layers/nonlocal_net.py
create mode 100644 pytorchvideo/layers/positional_encoding.py
create mode 100644 pytorchvideo/layers/squeeze_excitation.py
create mode 100644 pytorchvideo/layers/swish.py
create mode 100644 pytorchvideo/layers/utils.py
create mode 100644 pytorchvideo/models/__init__.py
create mode 100644 pytorchvideo/models/accelerator/__init__.py
create mode 100644 pytorchvideo/models/accelerator/mobile_cpu/__init__.py
create mode 100644 pytorchvideo/models/accelerator/mobile_cpu/efficient_x3d.py
create mode 100644 pytorchvideo/models/accelerator/mobile_cpu/residual_blocks.py
create mode 100644 pytorchvideo/models/byol.py
create mode 100644 pytorchvideo/models/csn.py
create mode 100644 pytorchvideo/models/head.py
create mode 100644 pytorchvideo/models/hub/__init__.py
create mode 100644 pytorchvideo/models/hub/efficient_x3d_mobile_cpu.py
create mode 100644 pytorchvideo/models/hub/resnet.py
create mode 100644 pytorchvideo/models/hub/slowfast.py
create mode 100644 pytorchvideo/models/hub/x3d.py
create mode 100644 pytorchvideo/models/masked_multistream.py
create mode 100644 pytorchvideo/models/memory_bank.py
create mode 100644 pytorchvideo/models/net.py
create mode 100644 pytorchvideo/models/r2plus1d.py
create mode 100644 pytorchvideo/models/resnet.py
create mode 100644 pytorchvideo/models/simclr.py
create mode 100644 pytorchvideo/models/slowfast.py
create mode 100644 pytorchvideo/models/stem.py
create mode 100644 pytorchvideo/models/weight_init.py
create mode 100644 pytorchvideo/models/x3d.py
create mode 100644 pytorchvideo/transforms/__init__.py
create mode 100644 pytorchvideo/transforms/functional.py
create mode 100644 pytorchvideo/transforms/transforms.py
create mode 100644 setup.cfg
create mode 100755 setup.py
create mode 100644 tests/README.md
create mode 100644 tests/__init__.py
create mode 100644 tests/benchmark_accelerator_efficient_blocks.py
create mode 100644 tests/benchmark_transforms.py
create mode 100644 tests/test_accelerator_deployment_mobile_cpu_model_conversion.py
create mode 100644 tests/test_accelerator_deployment_model_transmuter.py
create mode 100644 tests/test_accelerator_efficient_blocks_mobile_cpu_activation_attention.py
create mode 100644 tests/test_accelerator_efficient_blocks_mobile_cpu_conv3d.py
create mode 100644 tests/test_accelerator_efficient_blocks_mobile_cpu_head_layer.py
create mode 100644 tests/test_accelerator_efficient_blocks_mobile_cpu_residual_block.py
create mode 100644 tests/test_accelerator_models_efficient_x3d.py
create mode 100644 tests/test_data_charades_dataset.py
create mode 100644 tests/test_data_dataset_manifest_utils.py
create mode 100644 tests/test_data_domsev_dataset.py
create mode 100644 tests/test_data_encoded_video.py
create mode 100644 tests/test_data_encoded_video_dataset.py
create mode 100644 tests/test_data_epic_kitchen_dataset.py
create mode 100644 tests/test_data_epic_kitchen_forecasting.py
create mode 100644 tests/test_data_epic_kitchen_recognition.py
create mode 100644 tests/test_data_epic_kitchen_utils.py
create mode 100644 tests/test_data_frame_video.py
create mode 100644 tests/test_data_ssv2_dataset.py
create mode 100644 tests/test_data_utils.py
create mode 100644 tests/test_layers_convolutions.py
create mode 100644 tests/test_layers_fusion.py
create mode 100644 tests/test_layers_mlp.py
create mode 100644 tests/test_layers_nonlocal_net.py
create mode 100644 tests/test_layers_positional_encoding.py
create mode 100644 tests/test_layers_squeeze_excitation.py
create mode 100644 tests/test_models_byol.py
create mode 100644 tests/test_models_csn.py
create mode 100644 tests/test_models_head.py
create mode 100644 tests/test_models_masked_multistream.py
create mode 100644 tests/test_models_memory_bank.py
create mode 100644 tests/test_models_r2plus1d.py
create mode 100644 tests/test_models_resnet.py
create mode 100644 tests/test_models_slowfast.py
create mode 100644 tests/test_models_stem.py
create mode 100644 tests/test_models_x3d.py
create mode 100644 tests/test_simclr.py
create mode 100644 tests/test_transforms.py
create mode 100644 tests/utils.py
create mode 100644 tutorials/accelerator/Build_your_model_with_PytorchVideo_Accelerator.ipynb
create mode 100644 tutorials/accelerator/Use_Model_Transmuter.ipynb
create mode 100644 tutorials/accelerator/Use_PytorchVideo_Accelerator_Model_Zoo.ipynb
create mode 100644 tutorials/torchhub_inference_tutorial.ipynb
create mode 100644 website/.dockerignore
create mode 100644 website/.gitignore
create mode 100644 website/docs/tutorial_accelerator_build_your_model.md
create mode 100644 website/docs/tutorial_accelerator_use_accelerator_model_zoo.md
create mode 100644 website/docs/tutorial_accelerator_use_model_transmuter.md
create mode 100644 website/docs/tutorial_classification.md
create mode 100644 website/docs/tutorial_overview.md
create mode 100644 website/docs/tutorial_torchhub_inference.md
create mode 100644 website/website/README.md
create mode 100644 website/website/core/Footer.js
create mode 100644 website/website/package.json
create mode 100644 website/website/pages/en/index.js
create mode 100644 website/website/sidebars.json
create mode 100644 website/website/siteConfig.js
create mode 100644 website/website/static/css/custom.css
create mode 100644 website/website/static/img/efficient.svg
create mode 100644 website/website/static/img/favicon.png
create mode 100644 website/website/static/img/logo.svg
create mode 100644 website/website/static/img/logo_no_text.svg
create mode 100644 website/website/static/img/logo_white.svg
create mode 100644 website/website/static/img/modelzoo.svg
create mode 100644 website/website/static/img/oss_logo.png
create mode 100644 website/website/static/img/pytorch.svg
create mode 100644 website/website/static/img/reproducible.svg
diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 00000000..7fa6853f
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,205 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+# -------------------------------------------------------------------------------------
+# CircleCI configuration file.
+# Specifies automated environment setup and tests.
+#
+# See https://circleci.com/docs/2.0/language-python/ for more details
+# Available Machine Images:
+# https://circleci.com/docs/2.0/configuration-reference/#available-machine-images
+# -------------------------------------------------------------------------------------
+
+version: 2.1
+
+# -------------------------------------------------------------------------------------
+# Environments to run the jobs in
+# -------------------------------------------------------------------------------------
+cpu: &cpu
+ machine:
+ image: ubuntu-2004:202101-01
+
+gpu: &gpu
+ environment:
+ CUDA_VERSION: "10.2"
+ resource_class: gpu.medium # tesla m60
+ machine:
+ image: ubuntu-2004:202101-01
+
+setup_cuda: &setup_cuda
+ run:
+ name: Setup CUDA
+ working_directory: ~/
+ command: |
+ # download and install nvidia drivers, cuda, etc
+ wget --no-verbose --no-clobber -P ~/nvidia-downloads https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run
+ sudo sh ~/nvidia-downloads/cuda_11.2.2_460.32.03_linux.run --silent
+ echo "Done installing CUDA."
+ nvidia-smi
+
+# -------------------------------------------------------------------------------------
+# Re-usable commands
+# -------------------------------------------------------------------------------------
+install_conda: &install_conda
+ run:
+ name: Setup Conda
+ working_directory: ~/
+ command: |
+ curl --retry 3 -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+ sh conda.sh -b -p $HOME/miniconda3
+
+setup_ptv_conda: &setup_ptv_conda
+ run:
+ name: Setup Conda Environment
+ command: |
+ pyenv versions
+ export PATH="$HOME/miniconda3/bin:$PATH"
+ conda update -y conda
+ conda init bash
+ source ~/.bashrc
+ conda create --name pytorchvideo python=3.7.9
+
+install_pytorch: &install_pytorch
+ - run:
+ name: Install PyTorch
+ command: |
+ export PATH="$HOME/miniconda3/bin:$PATH"
+ conda activate pytorchvideo
+ conda install pytorch=1.8.0 torchvision -c pytorch
+ python -c 'import torch; print(torch.__version__)'
+ python -c 'import torch; print("CUDA:", torch.cuda.is_available())'
+ python -c 'import torchvision; print(torchvision.__version__)'
+
+install_pytorchvideo: &install_pytorchvideo
+ - run:
+ name: Install PyTorchVideo
+ command: |
+ export PATH="$HOME/miniconda3/bin:$PATH"
+ conda activate pytorchvideo
+ pip install -U --progress-bar off -e .[test]
+ python -c 'import pytorchvideo; print(pytorchvideo.__version__)'
+
+build_wheels: &build_wheels
+ - run:
+ name: Build Wheels
+ command: |
+ export PATH="$HOME/miniconda3/bin:$PATH"
+ conda activate pytorchvideo
+ python setup.py sdist
+
+ export BUILD_NIGHTLY="1"
+ python setup.py sdist
+
+run_unittests: &run_unittests
+ - run:
+ name: Run Unit Tests
+ command: |
+ export PATH="$HOME/miniconda3/bin:$PATH"
+ conda activate pytorchvideo
+ python -m unittest discover -v -s tests
+
+run_unittests_with_coverage: &run_unittests_with_coverage
+ - run:
+ name: Run Unit Tests
+ command: |
+ export PATH="$HOME/miniconda3/bin:$PATH"
+ conda activate pytorchvideo
+ coverage run -m unittest discover -v -s tests
+ bash <(curl -s https://codecov.io/bash)
+
+# -------------------------------------------------------------------------------------
+# Jobs to run
+# -------------------------------------------------------------------------------------
+jobs:
+ cpu_tests:
+ <<: *cpu
+ working_directory: ~/pytorchvideo
+ steps:
+ - checkout
+ - <<: *install_conda
+ - <<: *setup_ptv_conda
+ - <<: *install_pytorch
+ - <<: *install_pytorchvideo
+ - <<: *build_wheels
+ - <<: *run_unittests_with_coverage
+ - store_artifacts:
+ path: ~/pytorchvideo/dist
+ - persist_to_workspace:
+ root: ~/pytorchvideo/dist
+ paths:
+ - "*"
+
+ gpu_tests:
+ working_directory: ~/pytorchvideo
+ <<: *gpu
+ steps:
+ - checkout
+ - <<: *setup_cuda
+ - <<: *install_conda
+ - <<: *setup_ptv_conda
+ - <<: *install_pytorch
+ - <<: *install_pytorchvideo
+ - <<: *run_unittests
+
+ upload_wheel:
+ docker:
+ - image: circleci/python:3.7
+ auth:
+ username: $DOCKERHUB_USERNAME
+ password: $DOCKERHUB_TOKEN
+ working_directory: ~/pytorchvideo
+ steps:
+ - checkout
+ - attach_workspace:
+ at: ~/workspace
+ - run:
+ command: |
+ # skip the upload if there were no commits in the last 25 hours
+ if [[ -z $(git log --since="25 hours ago") ]]; then
+ echo "No commits in the last day."
+ exit 0
+ fi
+ pip install --progress-bar off --user twine
+ for pkg in ~/workspace/*.tar.gz; do
+ if [[ "$pkg" == *"nightly"* ]];
+ then
+ twine upload --verbose --skip-existing --username __token__ --password $PTV_NIGHTLY_PYPI_TOKEN $pkg
+ else
+ twine upload --verbose --skip-existing --username __token__ --password $PTV_PYPI_TOKEN $pkg
+ fi
+ done
+# -------------------------------------------------------------------------------------
+# Workflows to launch
+# -------------------------------------------------------------------------------------
+workflows:
+ version: 2
+ regular_test:
+ jobs:
+ - cpu_tests:
+ context:
+ - DOCKERHUB_TOKEN
+ - gpu_tests:
+ context:
+ - DOCKERHUB_TOKEN
+
+ nightly:
+ jobs:
+ # https://circleci.com/docs/2.0/contexts/#creating-and-using-a-context
+ - cpu_tests:
+ context:
+ - DOCKERHUB_TOKEN
+ - gpu_tests:
+ context:
+ - DOCKERHUB_TOKEN
+ - upload_wheel:
+ requires:
+ - cpu_tests
+ - gpu_tests
+ context:
+ - DOCKERHUB_TOKEN
+ triggers:
+ - schedule:
+ cron: "0 0 * * *"
+ filters:
+ branches:
+ only:
+ - master
diff --git a/.flake8 b/.flake8
new file mode 100644
index 00000000..6c3b6d91
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,6 @@
+[flake8]
+ignore = E203, E266, E501, W503, E221
+max-line-length = 88
+max-complexity = 18
+select = B,C,E,F,W,T4,B9
+exclude = build,__init__.py
diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md
new file mode 100644
index 00000000..f049d4c5
--- /dev/null
+++ b/.github/CODE_OF_CONDUCT.md
@@ -0,0 +1,76 @@
+# Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to make participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+ advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+ address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at . All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
new file mode 100644
index 00000000..0cea92e6
--- /dev/null
+++ b/.github/CONTRIBUTING.md
@@ -0,0 +1,55 @@
+# Contributing to PyTorchVideo
+We want to make contributing to this project as easy and transparent as
+possible.
+
+## Pull Requests
+We actively welcome your pull requests.
+
+However, if you're adding any significant features, please make sure to have a corresponding issue to outline your proposal and motivation and allow time for us to give feedback, *before* you send a PR.
+We do not always accept new features, and we take the following factors into consideration:
+
+- Whether the same feature can be achieved without modifying PyTorchVideo directly. If any aspect of the API is not extensible, please highlight this in an issue so we can work on making it more extensible.
+- Whether the feature is potentially useful to a large audience, or only to a small portion of users.
+- Whether the proposed solution has a good design and interface.
+- Whether the proposed solution adds extra mental/practical overhead to users who don't need such a feature.
+- Whether the proposed solution breaks existing APIs.
+
+When sending a PR, please ensure you complete the following steps:
+
+1. Fork the repo and create your branch from `master`. Follow the instructions
+ in [INSTALL.md](../INSTALL.md) to build the repo.
+2. If you've added code that should be tested, add tests.
+3. If you've changed any APIs, please update the documentation.
+4. Ensure the test suite passes:
+ ```
+ cd pytorchvideo/tests
+ python -m unittest -v
+ ```
+5. Make sure your code lints by running `dev/linter.sh` from the project root.
+6. If a PR contains multiple orthogonal changes, split it into multiple separate PRs.
+7. If you haven't already, complete the Contributor License Agreement ("CLA").
+
+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Facebook's open source projects.
+
+Complete your CLA here:
+
+## Issues
+We use GitHub issues to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+
+Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
+disclosure of security bugs. In those cases, please go through the process
+outlined on that page and do not file a public issue.
+
+## Coding Style
+We follow these [python](http://google.github.io/styleguide/pyguide.html) and [C++](https://google.github.io/styleguide/cppguide.html) style guides.
+
+For the linter to work, you will need to install `black`, `flake8`, `isort` and `clang-format`, and
+they need to be fairly up to date.
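+
+As a rough sketch (specific version pins are not given here), the Python tools can be installed with pip, and `clang-format` via your system package manager:
+```
+pip install -U black flake8 isort
+```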
+
+## License
+By contributing to PyTorchVideo, you agree that your contributions will be licensed
+under the LICENSE file in the root directory of this source tree.
+
diff --git a/.github/ISSUE_TEMPLATE/bugs.md b/.github/ISSUE_TEMPLATE/bugs.md
new file mode 100644
index 00000000..b6ea6e9f
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bugs.md
@@ -0,0 +1,30 @@
+---
+name: "🐛 Bugs / Unexpected behaviors"
+about: Please report unexpected behaviors or bugs in PyTorchVideo.
+
+---
+
+If you do not know the root cause of the problem / bug and would like someone to help you, please
+post according to this template:
+
+## 🐛 Bugs / Unexpected behaviors
+
+
+NOTE: Please look at the existing list of Issues tagged with the label [`bug`](https://github.com/facebookresearch/pytorchvideo/issues?q=label%3Abug). **Only open a new issue if this bug has not already been reported. If an issue already exists, please comment there instead.**
+
+## Instructions To Reproduce the Issue:
+
+Please include the following (depending on what the issue is):
+
+1. Any changes you made (`git diff`) or code you wrote
+```
+
+```
+2. The exact command(s) you ran:
+3. What you observed (including the full logs):
+```
+
+```
+
+Please also simplify the steps as much as possible so they do not require additional resources to
+ run, such as a private dataset, models, etc.
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 00000000..3ba13e0c
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1 @@
+blank_issues_enabled: false
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 00000000..4390d86b
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,21 @@
+---
+name: "\U0001F680 Feature Request"
+about: Submit a proposal/request for a new PyTorchVideo feature
+
+---
+
+## 🚀 Feature
+
+
+NOTE: Please look at the existing list of Issues tagged with the label [`enhancement`](https://github.com/facebookresearch/pytorchvideo/issues?q=label%3Aenhancement). **Only open a new issue if you do not see your feature request there**.
+
+## Motivation
+
+
+
+## Pitch
+
+
+
+NOTE: we only consider adding new features if they are useful for many users.
diff --git a/.github/ISSUE_TEMPLATE/questions-help.md b/.github/ISSUE_TEMPLATE/questions-help.md
new file mode 100644
index 00000000..76bc0d4d
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/questions-help.md
@@ -0,0 +1,21 @@
+---
+name: "❓ Questions"
+about: How do I do X with PyTorchVideo? How does PyTorchVideo do X?
+
+---
+
+## ❓ Questions on how to use PyTorchVideo
+
+
+
+
+NOTE: Please look at the existing list of Issues tagged with the label [`question`](https://github.com/facebookresearch/pytorchvideo/issues?q=label%3Aquestion) or [`how-to`](https://github.com/facebookresearch/pytorchvideo/issues?q=label%3A%22how+to%22). **Only open a new issue if you cannot find an answer there**.
+
+Also note the following:
+
+1. If you encountered any errors or unexpected issues while using PyTorchVideo and need help resolving them,
+ please use the "Bugs / Unexpected behaviors" issue template.
+
+2. We do not answer general machine learning / computer vision questions that are not specific to
+ PyTorchVideo, such as how a model works or what algorithm/methods can be
+ used to achieve X.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 00000000..b6851e7b
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,30 @@
+## Motivation and Context
+
+
+
+
+
+## How Has This Been Tested
+
+
+
+## Types of changes
+
+
+- [ ] Docs change / refactoring / dependency upgrade
+- [ ] Bug fix (non-breaking change which fixes an issue)
+- [ ] New feature (non-breaking change which adds functionality)
+- [ ] Breaking change (fix or feature that would cause existing functionality to change)
+
+## Checklist
+
+
+
+- [ ] My code follows the code style of this project.
+- [ ] My change requires a change to the documentation.
+- [ ] I have updated the documentation accordingly.
+- [ ] I have read the **CONTRIBUTING** document.
+- [ ] I have completed my CLA (see **CONTRIBUTING**)
+- [ ] I have added tests to cover my changes.
+- [ ] All new and existing tests passed.
+
diff --git a/.github/logo_horizontal_color.png b/.github/logo_horizontal_color.png
new file mode 100644
index 0000000000000000000000000000000000000000..8f7ee6778d9539e3560909a4b4bf55af6e71025b
GIT binary patch
literal 8071
zcma)hXHe5m)GjuZ-kV78Eulxcbm=8@0tnJeFrXk^nm|BMM5)}!L23C@eevE;_rsleXSSWQ`#k5Io!OnUvnl4L`ZSccDapvlXbkS_Sdx)l11`sy
zTh}hzxZNnz%kH|DrimsQ+4~HtGuIoJGVjCtmL_Cm;lgBO&l1SUPA*l?evpwpktQSi
z^?-~_IhTx#?a|v#3)RbsI&)(y-M@eTet5H29k8=H@?WgvddY*W$;QK%dOt{e=gD_B
z8zXji*UxgHzdn^60Li~M=TGK)NUN)>4Gj%}fq@zt8kaONF)?OlW^{CPt1+v`>&Ihq
zV+$S&dt-Yz9ImggFC!ykw{N$&xOm!ldb?@c#l?k(hv#R-&$GR=-Q8WB25voX9SjD4
z^!?T$Gd)+nJ8M6iB){I1yj!D+|0p+cxNwNNze-3W;DhiQ^7=}b_O7@Y>Fbas-{87`
zDNsDRZy!uXM#cEAUYTBwKD$)jd~9H%dvpC72b%)VT=!kVKTu$hsnr!+S!g~P8CSc3
zj;7Vq#ht>fe8Sz#dwdg02Pg|DyED9_9?a3)Hk=7CZE8o{c^*5k>N^Ny3{Kd_sAmc#vhc0OXSY
z(XU!`nJQf|q39|CVd5gD|C7R6&neDkh?mwCg5eYbIj%!NF
z^Irt2=QW#)%jn}s_pvQC*;NyY{KGJcmN9S})`Un|AJKg3AzpEH~U6mOo|0g1jS
z);d=3PCF^wco)e3xBV;9QNVGmbMC_Ek8-DPGDkbAGwcJo8hi)b75y8i6zsMA}w87&zr2Q
z@J7!nUPmBC!5}zF5opV`UIO12!tSgP%GM&+&-Ms&poS0iZ7*_~ap2*1eOa0DhDfR~
z`%KunYAMhcwI6*4j86LFrFpI&V=q)Bj#PQ?Cf-3Au7&7}`TM79bB!OHNP`Jd^v=4O
z3B~K3>m6{lDF3AVu#yk5g^kA>N=7Zf;v`vS)+S&Wvr^PVy7#wAn>8%hbqR>
z?yl;-EQeX@C}6ek@1LrDL@Xg4Rpx+=i7nyVAMCv!VT}j&zsV#Q^sel9tX?>*d485M
z$6v%_dmgWZ{pAg|2~?X&@j?T|HS%o8Sfm!>-k$+
zZ$k?Ht0)nX;3Ug`^v}@~Lg8i!Veiw%XtFKj$J#+7
zsBJ>X0)ZeT1&Ih1R1kuGmv?kja~>BU#HWAa720*GtYBB6fxWlP>E0Q@!5)(B$vdI`!+ku!j5^%9aLkudQUc9|=R!htZhJ;&ru{;L*X%}p=l0M?v
z`wW@d1p{;mb0O=bAZr*S&i6lQPjtCF`*qm97NOt
zg3@mkU5l?##Ty!(E`^66aA(K*-w28h6+V6)pTbJ+)nQxS{&OZJ_o|1($hvW8H!pMO
zBZH`U-@~oLkgT6CrhwVbxxE~%%xcuF><`L;}zwSdcEa}R;$h(V|U00*0)u}=e8)too7K&T2FN5$?Y+FVpH}lWU^sPGle>w>{qOO3
zvdPz;%MJSLoCZ|*NwAQvP&
zjhTd#AU3B@A-|(SijTfKUG6M(%hB?!LWAT6>rl)zG*}&fxb&$In
zs3M@DnolVYuyH(In^H}gU+Br#4eyoX$1Z!?Xk9Nfa-zdHd8TSL9c}l_MQ7(YAipuN
zsYL0XXZ4l`3U0WPUhpEYLT`L%$6dq_oZ7W%IVB+n$Ce`7_G~LIf(|gtrL#OR{1b
z0u#>CUPw1YzUpFDvuD4($|f3XY|nv!Tr{b^ziYTerLVu720+ga6c~=<*=4}-G?LVN
zDHBYP>aAAb~Km|9*4MSr4GsJ
zyo1=SMA^Z-M~Ld`&mOTfJgZS!;Ze&(M&wJy(dRW;5{R!DWydKz$%}lB61H$HjyFv)
zdk9?u;ELus1YLDzq08=ryssG{@hW&-|$F4!y{Sc#~R7Hu}uln!xhNgFHlFu
zlpaR<>m*7Po^*b)DvKd?)OL6k#pE}mecQGC5oUG2^Zx77lh2~ck0>r^1SlLiwD4)6
zgL8m$j8BC+(N$_W&B7p!
zV)^L+XN$b8z{b)*n36rASJq)6RbWv8B4s;~K)Ek{T%|&e9T%TzXvd9w-imUmn*^h`
z0=%g8P4v>AYEPIPoTQw8)KqyQpc!O-2$NegX=GLq}uKtUqv#J_!g1a
z#7(*)JE&R0Ahh{wX(Xbm?Z(BFL+GT(rqWf60?m4uC(Q3*j2l~+PT=0-ydhWH&pwFv
zN?bPI>=rJfp8XJb^-M|(4Yg)mlXNnl31$vhzd84=IBZf%ASvrcz}9&`I{{>fzv*)s
zlCs-x4%OHF$NfWpxI(Tw;%)1OQ!+?*rzCI|yhKzJRlR)M4OR1~_?fZeOfkgNf`v_C
z>c!RGLQ;2bB}j3@2H8jp&RaoUk$=b$EKK-=M3UDO9}Nj3rEU<*sC
z-&iw2Ve*1Tsv*gcu@KeII`8`)eJgf5g|Uc1RDk%3#Ih~PTY5Has=JeHmew0V(zpAy<96#%RLDfhZONd{_N3gr%srM4N?xsu>+DNx6g+w$3~Qx9`F{CO|ZX76PI
zljg*PB25?!xUs16dr6IY;9;7>XWutB=_bHvjlSc;mN!V1ePwA6^)cy{IJ#q5Z0qe2
zzIn;u`8=e$5Zqg5^j91uDYy~iG}Qp94w}-=rI;9xPgAU+K$I#{jPqAwZt(roDj6|(
zi&Ocu8GCxmwj&H1)!1f<_*T03RHM@NcJ9ZbWH%YC!V?lf?e48sKEY6|8u^Iw9~U;C
z>}@%(Z^aJ=BHo%9ad7>{Vp~rtzv)7%fish+@gC(Dx4h4r_B`wb^h?Dd#Q@#BjVQ?K
zE;{_TRi*^ZSZneDhbp4fvB$GMppk(4`OXQUeeRp9&SSW)8ztw47)B?M`izMW84@TJN$PcR#qyL^&FS&)~`x=2yl3jIUKT)dc%iRttAkGUyGhy(c?rg3~WC#LUjH+_DIzWSUS
z5zf#k1%G5DSlyx2tm1T?kgu+EVKy*=_r@DdYMUcUUyRs~JWJuUBYgdroO<{qQ?xMJuR
zVNEi-aMJ(4Ja2!xlk#1wAvlS}Ra`B|P4J_D9qosUe9m$hJw$ctgtt%9yani|Ry6qW
zjOq)NcEQ(9#!?5*P)TP0d{c$;$TYT1wh*f)i8f!Nipi=0nZZK)I~$6qo>v(|a=Gcg
zCHoQWrE`(~txZCG%$7Q1Lk7C3RIMDArTK~ivkQZmfMXtKY8}{|6rSBP5F06Iu=+{W
zKonFOup4u>=sbIVYSxcReQ=roba3!=<96iT-^xVjuh8k^bA`9Yz_iJnl2>lC3>ZL(
ziIj|xtbTG|ZgJk!6hHONicB4hM?&lPj-qNiV>l#C=EaTh>jr|CiA&FE$Ii>*z7<1D
zh#+xGy^xoJLgU6Sf#mJUcn~$>M$?rIY*V^a>U*s?UOm{J`3&u0D4EIMmoUwQWU5&u
zupd7^TFm2%@}PMWjp$0&dH;JT@4}{#77h=G*s+jRK!lI!u`TaeM$<4
z=|gd$WcQr!Tt$4N8akUuOy&IaV21y1_%(}b{Hoo5pnbq=PwKA)H3>>oYZYGau^!$L
zv6k&wN*sFl<0ql9gJ-XAThH-8`d1Lsi?YL}SmSQr7TOzu90bZ7
z`YK$|iiA1CK2`GK=dP!S;TIeF_H>a67!UC^w)W+;(xM)!i|jH?s9`^&ZOKkzV0q^a
zvns}vi1H!jVfB&}nnd^4$rAa}X$g`45oA628JWSLvC%_}nMsl?->5XMbV+by!VUPw
zmSg8>`qCY@sryNGHANN42fw+RP3CyzU?=A9z^qh=>2n>tP%z)mwErAM8wcq_xzsV*
zaD)WNR>}TV5WIF-tAaEwA~WbYVxgp5@d?4
z^jm7gEhYW2ROyU>ML6%
z)f$&{7a{R3zz)CW%Vyjeer?Fca<
zYCXrwM!H+<5`uZ<;jJlreek{o(9?4d`J0QsQ|aNvn2Q}l=uk&wEFn(123?;5N!l>O
z>%*q5Q{s;4Zhu5IwPgdlOHy6FSm|$Kp)GUN1oC~UVGN6P6f~yaObR5MRnYiC6T?OI
z<_`qbA+aspz2F~b{FDS@X?I&(l&hi?-Todr^50l)A#QSwT?md?1Ij2IA%+ZW!w+Ui
z(Z8-SK*Be&1VB1HYkO4+38Al)>+T~Yssj26T2DvvoF2Crf;>*+#nHuUIMbp1J=Gi=6OESNEWsMX!bi#1{E)K
zi4GnR%Ydtmv(0yB4Hyw?T!hb5sAvz-$kD54zs-I@x6m{(c|lsCuZo
zwn}pl24uRDPxJXa5>us9Io*^ygi^K))cn<51tWj*66QQccnJfJx{8SdlVgz`yp<^W
z0-!Xt{avahQFC6<_unb(zd=TLV@l{lDz5`3
zqh$O`u_TjM3H}QB!dozB%;xLVtbjOzjpGslJDyN=lq_@W4k9liZQtN$#&w-Xkqtg9?GPyk
zvfBFoHd8KU|9e}YXTQ}=%4EN;wAnQN`H09b=cCPwZsqEUoO|d)s8pPDoA~eaItzqB
zED0^lxGld^0aoerTXE=*zRCxx*~#$rc8mkeJDiH)GxZ*AzV`63a`L>0kz5eN{Gyt{
z+tOBf1ayYk*hmSt!lw{XVDenH8u%K6->!+>j%1IaR76=IinW(yF$%clRzxH-E6#Fy
zaX#OAU292sM&(Vi-wvJRidVBLVtgjt$?}2xM04ox)8-`)&i!n@_=wB1(TS%Sh_0t4-e?
zyaF0EkKCm0Fp8VOlFQRkx5Zrxk*R*R9ef_bn7LS+N=!AJm>Z@3_>t1?tx}-8Z5=+E
zWF7}hmZ|1_gC3DlHj|Xu*mXiobP0>rM2ER?DGQj~kLlR(OeKSt^z*er*G2>%x!b@Z
zRN|u>C&~lY5|vkL1C^j=Is_ZDB}+`=25I8%Ai6l00GEuv>Jd0+m3NsqIui+M{}pau
zgr*P%%;9bhl^xS^(b7#>ehedaPs+f;?wjFvG8ujmzd%m%zih7QIRw(pl{DPLi~BmJ
z14FfI0Zj}sNG2#uqVbOO{NB-z6!&3Ft8mcuRrUsnuD?!YfOLO#-~~@`jEW(w>rJY}
z%%h{dJ+X&8lY?G~kHJelr+SE?wEeCl&H7>IwzTh+uvPJz040J$KX0`D0TUV!@l$D1
zc<(&vD-w~zLWxBwMX)iFo;q{}n~pHgqqE>IJh{t(_@cG)hH0*Y<*m7RY|ELt;;^?U?wpC3HwdGR?VB4W^@wjXbDvXP7ljthpm%%^PQ^2<8^8H`n!UtED59T@xFOJVN
z1E#${yI6uzw9T(S(6^6V+PXZ2zZJ#fc)>3sUahHHqxwLF<591oqOgR@ai@B^h-!Pt
zOrEunqtPuR7Dx}1l**dUJ^VIj>4IOY^9jnPGQA4(gkN9&LS;7yg7s{B?MIEMLj~9q
zly~f=POJ^IYCdnd%DAlT-Rpzr)I=+`@wCJ6X&v)~h62i}_ABjfK@~_xBkSn;pXqE`
zgz&M$1y3z)JJrwu1vZYMUrj^-=?CBG>DzxTW(%P?i!zOe**xM@+-V%@dkTVZ@0~*u
z0O#QI57whBkl9KPjb2TJQUN}lBafvE^+?IKQSTzTG}`Q?RXTI~$zrxJdgjU-W9yd{
zkMN0O$Jck=KPEdyM0q*0DSuc;syyO;aKT*B2(;>b$aNktN*R%9SQv
zgWz^uu-AR%&%;~`djeMYVzbf!AHpKa&cag1XWHHCgGjYm_I-rk(r?A2U=?96e`=g=
z3d1~niZX~l2%~1(iF)QcCiuO)vnTp4H=cI+7LN|h*D+4q$FT^YFUl0OcrhH(tCo3k
z=O;<89@tzVi&}oeX}G!vW)?uJWsSZiK8+PY2V&;lS1g|hGP7`|o&EXdtnqERvHjWD
zC+TYq0PbBt7-0zB%G7Mq&)smoq8;2uz^ZXRpF7&^dMwqKw|a}UD^E(}r6iaC;Ko#3
zVgu7r|MHO$xLt~ke5ZHIxlY3^y(fCbz%nq8@j3WV(6_WfAMZ7N#7!8nnHq!R+P(X?
zJU8kt(wl!TKHqs!4Gvr6k!5pk2qB!YCf-EfUMiwZn|3Bm$Nr!23yQ$Ng7u(N-HLPHa!izB!02Ft6IC?d
zq`!I{PDQCsRVp2*m3kmmX`40TntB>ji|Vg;Wbv|wW~tn(Z#qwGZOujvZTnY=dJNK&
z*gms1m5q`kSk_IiW6xPyTQ}%GZr`sFt=QLYJ#;1al<%&y%H*j<-TcH#`k`XM3=XVC
zB@LS3m*4H`Saf&Sv~iE|qD=)XYljPb3p^U)0YhvUG@v;bs~lQLtQL)2v09?`WHKhU
z%^-sU{|UnKLielcMMKw3T1~F~tR&0<;)chUJU-dV$#qsmkXgkWR>19iOYzZ5LbsYy
zv5J$+S&(qKFiYe5e(z}b2_dPS%dMtDGNm~}^_;v(OIbBrJreV)DMS+xZJxUI&B
z|NF&A46CT4VyD1P*yMc#-m|Hd}KeiC5!GCs>~{P+_i8v1Qn4kYKT?ZfwmG6
zHM6KP)oV-ZzR{?#)lDOluu@TE+5fuiMV@I`5U63E?DavH|G7esb?qLzyFT_%ei-C&
z*^$Xg%gRVe%Sp*9SjoyME66G<+?9})R+g65Nq4gTe*gi2?mnJj|9^nO%=*tu0GWZV
LsSaA}LEQfU5_HQB
literal 0
HcmV?d00001
diff --git a/.github/logo_horizontal_color.svg b/.github/logo_horizontal_color.svg
new file mode 100644
index 00000000..cbee163c
--- /dev/null
+++ b/.github/logo_horizontal_color.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..776740c4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,34 @@
+*.DS_Store
+
+build/
+_ext
+*.pyc
+*.pyd
+*.so
+*.dll
+*.egg-info/
+**/__pycache__/
+*-checkpoint.ipynb
+**/.ipynb_checkpoints
+**/.ipynb_checkpoints/**
+
+
+# Docusaurus site
+website/yarn.lock
+website/build/
+website/i18n/
+website/node_modules/*
+website/npm-debug.log
+
+## Generated for tutorials
+website/_tutorials/
+website/static/files/
+website/pages/tutorials/*
+!website/pages/tutorials/index.js
+
+
+## Conda and pip builds
+packaging/out/
+packaging/output_files/
+dist/
+wheels/
diff --git a/.readthedocs.yml b/.readthedocs.yml
new file mode 100644
index 00000000..d27f4993
--- /dev/null
+++ b/.readthedocs.yml
@@ -0,0 +1,25 @@
+# .readthedocs.yml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+ builder: html
+ configuration: docs/source/conf.py
+
+# Build documentation with MkDocs
+#mkdocs:
+# configuration: mkdocs.yml
+
+# Optionally build your docs in additional formats such as PDF and ePub
+formats: all
+
+# Optionally set the version of Python and requirements required to build your docs
+python:
+ version: 3.7
+ system_packages: true
+ install:
+ - requirements: docs/requirements.txt
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 00000000..59d832c5
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,41 @@
+# Contributing to PyTorchVideo
+We want to make contributing to this project as easy and transparent as
+possible.
+
+## Pull Requests
+We actively welcome your pull requests.
+
+1. Fork the repo and create your branch from `master`.
+2. If you've added code that should be tested, add tests.
+3. If you've changed APIs, update the documentation.
+4. Ensure the test suite passes.
+5. Make sure your code lints.
+6. If you haven't already, complete the Contributor License Agreement ("CLA").
+
+## Testing
+
+Please follow the instructions mentioned in [test-README](https://github.com/facebookresearch/pytorchvideo/blob/master/tests/README.md) to run the existing tests as well as any tests you add.
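+
+For a quick local run (this mirrors the command used in the CI configuration), the tests can be invoked from the repository root:
+```
+python -m unittest discover -v -s tests
+```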
+
+## Linting
+
+We provide a linting script to correctly format your code changes.
+Please follow the instructions mentioned in [dev-README](https://github.com/facebookresearch/pytorchvideo/blob/master/dev/README.md) to run the linter.
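+
+For reference, the linter script ships with the repo and is run from the project root:
+```
+./dev/linter.sh
+```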
+
+
+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Facebook's open source projects.
+
+Complete your CLA here:
+
+## Issues
+We use GitHub issues to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+
+Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
+disclosure of security bugs. In those cases, please go through the process
+outlined on that page and do not file a public issue.
+
+## License
+By contributing to PyTorchVideo, you agree that your contributions will be licensed
+under the LICENSE file in the root directory of this source tree.
diff --git a/INSTALL.md b/INSTALL.md
new file mode 100644
index 00000000..bdfc293f
--- /dev/null
+++ b/INSTALL.md
@@ -0,0 +1,68 @@
+# Installation
+
+## Installing PytorchVideo
+
+
+### 1. Install from PyPI
+For stable release,
+```
+pip install pytorchvideo
+```
+
+If you would rather set up a conda environment with the required dependencies first (see Requirements below),
+```
+conda create -n pytorchvideo python=3.7
+conda activate pytorchvideo
+conda install -c pytorch pytorch=1.8.0 torchvision cudatoolkit=10.2
+conda install -c conda-forge -c fvcore -c iopath fvcore=0.1.4 iopath
+```
+
+For nightly builds,
+```
+pip install pytorchvideo-nightly
+```
+
+### 2. Install from GitHub using pip
+```
+pip install "git+https://github.com/facebookresearch/pytorchvideo.git"
+```
+To install the code of the released version instead of the main branch, use the following:
+```
+pip install "git+https://github.com/facebookresearch/pytorchvideo.git@stable"
+```
+
+### 3. Install from a local clone
+```
+git clone https://github.com/facebookresearch/pytorchvideo.git
+cd pytorchvideo
+pip install -e .
+
+# For developing and testing
+pip install -e .[test,dev]
+```
+
+
+## Requirements
+
+### Core library
+
+- Python 3.7 or 3.8
+- PyTorch 1.8.0 or higher.
+- torchvision that matches the PyTorch installation. You can install them together as explained at pytorch.org to make sure of this.
+- [fvcore](https://github.com/facebookresearch/fvcore) version 0.1.4 or higher
+- [ioPath](https://github.com/facebookresearch/iopath)
+- If CUDA is to be used, use a CUDA version that is supported by the corresponding PyTorch version (at least 10.2).
+
+We recommend setting up a conda environment with PyTorch and torchvision before installing PyTorchVideo.
+For instance, follow the instructions below to set up the conda environment,
+```
+conda create -n pytorchvideo python=3.7
+conda activate pytorchvideo
+conda install -c pytorch pytorch=1.8.0 torchvision cudatoolkit=10.2
+```
+
+## Testing
+
+Please follow the instructions mentioned in [test-README](https://github.com/facebookresearch/pytorchvideo/blob/master/tests/README.md) to run the provided tests.
+
+## Linting
+
+We also provide a linting script to correctly format your code edits.
+Please follow the instructions mentioned in [dev-README](https://github.com/facebookresearch/pytorchvideo/blob/master/dev/README.md) to run the linter.
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..5a90478a
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+Apache License
+Version 2.0, January 2004
+http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+"License" shall mean the terms and conditions for use, reproduction,
+and distribution as defined by Sections 1 through 9 of this document.
+
+"Licensor" shall mean the copyright owner or entity authorized by
+the copyright owner that is granting the License.
+
+"Legal Entity" shall mean the union of the acting entity and all
+other entities that control, are controlled by, or are under common
+control with that entity. For the purposes of this definition,
+"control" means (i) the power, direct or indirect, to cause the
+direction or management of such entity, whether by contract or
+otherwise, or (ii) ownership of fifty percent (50%) or more of the
+outstanding shares, or (iii) beneficial ownership of such entity.
+
+"You" (or "Your") shall mean an individual or Legal Entity
+exercising permissions granted by this License.
+
+"Source" form shall mean the preferred form for making modifications,
+including but not limited to software source code, documentation
+source, and configuration files.
+
+"Object" form shall mean any form resulting from mechanical
+transformation or translation of a Source form, including but
+not limited to compiled object code, generated documentation,
+and conversions to other media types.
+
+"Work" shall mean the work of authorship, whether in Source or
+Object form, made available under the License, as indicated by a
+copyright notice that is included in or attached to the work
+(an example is provided in the Appendix below).
+
+"Derivative Works" shall mean any work, whether in Source or Object
+form, that is based on (or derived from) the Work and for which the
+editorial revisions, annotations, elaborations, or other modifications
+represent, as a whole, an original work of authorship. For the purposes
+of this License, Derivative Works shall not include works that remain
+separable from, or merely link (or bind by name) to the interfaces of,
+the Work and Derivative Works thereof.
+
+"Contribution" shall mean any work of authorship, including
+the original version of the Work and any modifications or additions
+to that Work or Derivative Works thereof, that is intentionally
+submitted to Licensor for inclusion in the Work by the copyright owner
+or by an individual or Legal Entity authorized to submit on behalf of
+the copyright owner. For the purposes of this definition, "submitted"
+means any form of electronic, verbal, or written communication sent
+to the Licensor or its representatives, including but not limited to
+communication on electronic mailing lists, source code control systems,
+and issue tracking systems that are managed by, or on behalf of, the
+Licensor for the purpose of discussing and improving the Work, but
+excluding communication that is conspicuously marked or otherwise
+designated in writing by the copyright owner as "Not a Contribution."
+
+"Contributor" shall mean Licensor and any individual or Legal Entity
+on behalf of whom a Contribution has been received by Licensor and
+subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+this License, each Contributor hereby grants to You a perpetual,
+worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+copyright license to reproduce, prepare Derivative Works of,
+publicly display, publicly perform, sublicense, and distribute the
+Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+this License, each Contributor hereby grants to You a perpetual,
+worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+(except as stated in this section) patent license to make, have made,
+use, offer to sell, sell, import, and otherwise transfer the Work,
+where such license applies only to those patent claims licensable
+by such Contributor that are necessarily infringed by their
+Contribution(s) alone or by combination of their Contribution(s)
+with the Work to which such Contribution(s) was submitted. If You
+institute patent litigation against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that the Work
+or a Contribution incorporated within the Work constitutes direct
+or contributory patent infringement, then any patent licenses
+granted to You under this License for that Work shall terminate
+as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+Work or Derivative Works thereof in any medium, with or without
+modifications, and in Source or Object form, provided that You
+meet the following conditions:
+
+(a) You must give any other recipients of the Work or
+Derivative Works a copy of this License; and
+
+(b) You must cause any modified files to carry prominent notices
+stating that You changed the files; and
+
+(c) You must retain, in the Source form of any Derivative Works
+that You distribute, all copyright, patent, trademark, and
+attribution notices from the Source form of the Work,
+excluding those notices that do not pertain to any part of
+the Derivative Works; and
+
+(d) If the Work includes a "NOTICE" text file as part of its
+distribution, then any Derivative Works that You distribute must
+include a readable copy of the attribution notices contained
+within such NOTICE file, excluding those notices that do not
+pertain to any part of the Derivative Works, in at least one
+of the following places: within a NOTICE text file distributed
+as part of the Derivative Works; within the Source form or
+documentation, if provided along with the Derivative Works; or,
+within a display generated by the Derivative Works, if and
+wherever such third-party notices normally appear. The contents
+of the NOTICE file are for informational purposes only and
+do not modify the License. You may add Your own attribution
+notices within Derivative Works that You distribute, alongside
+or as an addendum to the NOTICE text from the Work, provided
+that such additional attribution notices cannot be construed
+as modifying the License.
+
+You may add Your own copyright statement to Your modifications and
+may provide additional or different license terms and conditions
+for use, reproduction, or distribution of Your modifications, or
+for any such Derivative Works as a whole, provided Your use,
+reproduction, and distribution of the Work otherwise complies with
+the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+any Contribution intentionally submitted for inclusion in the Work
+by You to the Licensor shall be under the terms and conditions of
+this License, without any additional terms or conditions.
+Notwithstanding the above, nothing herein shall supersede or modify
+the terms of any separate license agreement you may have executed
+with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+names, trademarks, service marks, or product names of the Licensor,
+except as required for reasonable and customary use in describing the
+origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+agreed to in writing, Licensor provides the Work (and each
+Contributor provides its Contributions) on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+implied, including, without limitation, any warranties or conditions
+of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+PARTICULAR PURPOSE. You are solely responsible for determining the
+appropriateness of using or redistributing the Work and assume any
+risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+whether in tort (including negligence), contract, or otherwise,
+unless required by applicable law (such as deliberate and grossly
+negligent acts) or agreed to in writing, shall any Contributor be
+liable to You for damages, including any direct, indirect, special,
+incidental, or consequential damages of any character arising as a
+result of this License or out of the use or inability to use the
+Work (including but not limited to damages for loss of goodwill,
+work stoppage, computer failure or malfunction, or any and all
+other commercial damages or losses), even if such Contributor
+has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+the Work or Derivative Works thereof, You may choose to offer,
+and charge a fee for, acceptance of support, warranty, indemnity,
+or other liability obligations and/or rights consistent with this
+License. However, in accepting such obligations, You may act only
+on Your own behalf and on Your sole responsibility, not on behalf
+of any other Contributor, and only if You agree to indemnify,
+defend, and hold each Contributor harmless for any liability
+incurred by, or claims asserted against, such Contributor by reason
+of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+To apply the Apache License to your work, attach the following
+boilerplate notice, with the fields enclosed by brackets "[]"
+replaced with your own identifying information. (Don't include
+the brackets!) The text should be enclosed in the appropriate
+comment syntax for the file format. We also recommend that a
+file or class name and description of purpose be included on the
+same "printed page" as the copyright notice for easier
+identification within third-party archives.
+
+Copyright 2019, Facebook, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 00000000..538a8f8e
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,3 @@
+include LICENSE
+include CONTRIBUTING.md
+include requirements.txt
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..3c1f12c4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,79 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A deep learning library for video understanding research.
+
+
+ Check the website for more information.
+
+
+
+## Introduction
+
+PyTorchVideo is a deep learning library with a focus on video understanding work. PyTorchVideo provides reusable, modular and efficient components needed to accelerate video understanding research. PyTorchVideo is developed using [PyTorch](https://pytorch.org) and supports different deep learning video components like video models, video datasets, and video-specific transforms.
+
+Key features include:
+
+- **Based on PyTorch:** Built using PyTorch, which makes it easy to use all of the PyTorch-ecosystem components.
+- **Reproducible Model Zoo:** A variety of state-of-the-art pretrained video models and their associated benchmarks that are ready to use.
+  Complementing the model zoo, PyTorchVideo comes with extensive data loaders supporting different datasets.
+- **Efficient Video Components:** Fast and efficient video-focused components that are easy to use. Supports accelerated inference on hardware.
+
+
+## Installation
+
+Install PyTorchVideo inside a conda environment (Python >= 3.7) with
+```shell
+pip install pytorchvideo
+```
+
+For detailed instructions please refer to [INSTALL.md](INSTALL.md).
+
+## License
+
+PyTorchVideo is released under the [Apache 2.0 License](LICENSE).
+
+## Tutorials
+
+Get started with PyTorchVideo by trying out one of our [tutorials](https://pytorchvideo.org/tutorials/) or by running examples in the [projects folder](https://github.com/facebookresearch/pytorchvideo/tree/master/projects).
+
+
+## Model Zoo and Baselines
+We provide a large set of baseline results and trained models available for download in the [PyTorchVideo Model Zoo](https://github.com/facebookresearch/pytorchvideo/blob/master/docs/source/model_zoo.md).
+
+## Contributors
+
+PyTorchVideo is written and maintained by Facebook AI Research, including the following members (in alphabetical order): Aaron Adcock, Amy Bearman, Bernard Nguyen, Bo Xiong, Chengyuan Yan, Christoph Feichtenhofer, Dave Schnizlein, Haoqi Fan, Heng Wang, Jackson Hamburger, Kalyan Vasudev Alwala, Matt Feiszli, Nikhila Ravi, Tullie Murrell, Wan-Yen Lo, Weiyao Wang, Yanghao Li, Yilei Li, Zhengxing Chen, Zhicheng Yan.
+
+## Development
+
+We welcome new contributions to PyTorchVideo and we will be actively maintaining this library! Please refer to [`CONTRIBUTING.md`](./.github/CONTRIBUTING.md) for full instructions on how to run the code, tests and linter, and submit your pull requests.
+
+
diff --git a/dev/README.md b/dev/README.md
new file mode 100644
index 00000000..f6030ff0
--- /dev/null
+++ b/dev/README.md
@@ -0,0 +1,11 @@
+## Running Linter
+
+
+Before running the linter, please ensure that you have installed the necessary additional linter dependencies.
+If they are not installed, check the [install-README](https://github.com/facebookresearch/pytorchvideo/blob/master/INSTALL.md) for how to do so.
+
+After that, you can run the linter from the project root using:
+
+```
+./dev/linter.sh
+```
diff --git a/dev/linter.sh b/dev/linter.sh
new file mode 100755
index 00000000..5a517f69
--- /dev/null
+++ b/dev/linter.sh
@@ -0,0 +1,32 @@
+#!/bin/bash -ev
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Run this script at project root with "./dev/linter.sh" before you commit.
+
+{
+ black --version | grep "20.8b1" > /dev/null
+} || {
+ echo "Linter requires black==20.8b1 !"
+ exit 1
+}
+
+echo "Running autoflake..."
+python -m autoflake --remove-all-unused-imports -i .
+
+echo "Running isort..."
+isort -y -sp .
+
+echo "Running black..."
+black .
+
+echo "Running flake8..."
+if [ -x "$(command -v flake8-3)" ]; then
+ flake8-3 .
+else
+ python3 -m flake8 .
+fi
+
+command -v arc > /dev/null && {
+ echo "Running arc lint ..."
+ arc lint
+}
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 00000000..d0c3cbf1
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = source
+BUILDDIR = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 00000000..ab8a5b22
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,65 @@
+
+## Setup
+
+### Install dependencies
+
+```
+pip install -U recommonmark mock sphinx sphinx_rtd_theme sphinx_markdown_tables
+```
+
+### Add symlink to the root README.md
+
+We want to include the root readme as an overview. Before generating the docs, create a symlink to the root readme.
+
+```
+cd docs
+ln -s ../README.md overview.md
+```
+
+In `conf.py` for deployment this is done using `subprocess.call`.
+
+### Add a new file
+
+Add a new `.md` or `.rst` file and add its name to the doc tree in `index.rst`, e.g.
+
+```
+.. toctree::
+ :maxdepth: 1
+ :caption: Intro Documentation
+
+ overview
+```
+
+### Build
+
+From `pytorchvideo/docs` run:
+
+```
+> make html
+```
+
+The website is generated in `build/html`.
+
+### Common Issues
+
+Sphinx can be fussy, and sometimes about things you weren’t expecting. For example, you might encounter something like:
+
+```
+WARNING: toctree contains reference to nonexisting document u'overview'
+...
+checking consistency...
+/docs/overview.rst::
+WARNING: document isn't included in any toctree
+```
+
+You might have indented `overview` in the `.. toctree::` of `index.rst` with four spaces, when Sphinx is expecting three.
+
+
+### View
+
+Start a simple Python server:
+
+```
+> python -m http.server
+```
+
+Navigate to: `http://0.0.0.0:8000/`
+
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 00000000..6247f7e2
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 00000000..58a9308d
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,16 @@
+docutils==0.16
+# https://github.com/sphinx-doc/sphinx/commit/7acd3ada3f38076af7b2b5c9f3b60bb9c2587a3d
+sphinx==3.2.0
+recommonmark==0.6.0
+sphinx_rtd_theme
+sphinx_markdown_tables
+mock
+numpy
+av
+opencv-python
+parameterized
+git+git://github.com/facebookresearch/fvcore.git
+git+git://github.com/facebookresearch/iopath.git
+https://download.pytorch.org/whl/cpu/torchvision-0.8.2%2Bcpu-cp37-cp37m-linux_x86_64.whl
+https://download.pytorch.org/whl/cpu/torch-1.7.1%2Bcpu-cp37-cp37m-linux_x86_64.whl
+
diff --git a/docs/source/_static/css/pytorchvideo_theme.css b/docs/source/_static/css/pytorchvideo_theme.css
new file mode 100644
index 00000000..d1ee46a0
--- /dev/null
+++ b/docs/source/_static/css/pytorchvideo_theme.css
@@ -0,0 +1,134 @@
+/* Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. */
+/*
+ * some extra css to make markdown look similar between github/sphinx
+ */
+
+/*
+ * Below is for install.md:
+ */
+.rst-content code {
+ white-space: pre;
+ border: 0px;
+}
+
+th {
+ border: 1px solid #e1e4e5;
+}
+
+div.section > details {
+ padding-bottom: 1em;
+}
+
+body {
+ font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
+}
+
+/* Default header fonts are ugly */
+h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption {
+ font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
+}
+
+
+/* Settings for the top left section for logo */
+.wy-side-nav-search {
+ background-color: #fff;
+}
+
+.wy-nav-content-wrap, .wy-menu li.current > a {
+ background-color: #fff;
+}
+
+@media screen and (min-width: 1400px) {
+ .wy-nav-content-wrap {
+ background-color: rgba(0, 0, 0, 0.0470588);
+ }
+
+ .wy-nav-content {
+ background-color: #fff;
+ }
+}
+
+/* Make sure that the logo fits nicely and takes the space available */
+@media screen and (max-width: 1000px) {
+ .wy-side-nav-search>a img.logo {
+ height: 60px;
+ }
+}
+
+.wy-side-nav-search a {
+ display: block;
+}
+
+.wy-side-nav-search>div.version {
+ color: #000;
+}
+
+/* This is needed otherwise the home icon disappears */
+a {
+ color: #0B1BE8;
+}
+
+a:hover {
+ color: #0B1BE8;
+}
+
+a:visited {
+ color: #0B1BE8;
+}
+
+/*a.icon.icon-home {
+ color: #f0327a;
+}
+.version{
+ color: #f0327a !important;
+}*/
+
+.wy-menu a {
+ color: #b3b3b3;
+}
+
+.wy-menu a:hover {
+ color: #b3b3b3;
+}
+
+.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
+ color: #0B1BE8;
+}
+
+.rst-content dl:not(.docutils) dt {
+ display: table;
+}
+
+/* footer settings */
+footer {
+ font-size: 80%;
+}
+
+footer .rst-footer-buttons {
+ font-size: 125%; /* revert footer settings - 1/80% = 125% */
+}
+
+footer p {
+ font-size: 100%;
+}
+
+/* Fixes for mobile - adopted from pytorch theme*/
+.wy-nav-top {
+ background-color: #fff;
+ background-image: url('../img/pytorchvideo-logo.png');
+ background-repeat: no-repeat;
+ background-position: center;
+ padding: 0;
+ margin: 0.4045em 0.809em;
+ color: #333;
+}
+
+.wy-nav-top > a {
+ display: none;
+}
+
+@media screen and (max-width: 768px) {
+ .wy-side-nav-search>a img.logo {
+ height: 60px;
+ }
+}
diff --git a/docs/source/_static/img/ptv_logo.png b/docs/source/_static/img/ptv_logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..b81fc9bcda59fce259ed0f07252b77ab78a915e5
GIT binary patch
literal 10053
zcmeHt_g9lk)NTSu2T?#!qbj-_JaTjgzfC#A9RF>I)p$(Aa`zFw+PAJAoizRw8|3t
z$!zp$E(V%LPcQT-T2U(8gXb$FOH%Or=B@i
zmxe&T@5T&rLh`-n1R=U}3>uK=FDyXfXWS$RgjHe)3gI!%r-w*p>oP%1)nWkSlmE~7
z|I|Ze6BeoZFxo**!(b>@@Ie2F&?ieb>DJ>}1*^*4;qap;VIfCrJHHNI*0LyEi{T`F
zmw>|~!^1)%QJ`b&!72U}T_)s)9~|DMuGQSCq1CMI9|S#cXBQmG2nU>zfU5w5q3U#J
zFzIMaL5Sw>=nsNQFc^6Z<}NR;7R*c=P}6N>&~TItJ|}Ma-jUk=mWdRv%d~!jHE6b*
zfmO)`+VIa16t|=1)|I6TtvZz^7Oj|RlCSoXg~wgo6jE1mgDL3_YfxJX3y;L#=SxFS
z7v;S3w79p`kTMozJ7Lm>-SJcmr)1~+gfn2c!bOT6xdyo1y66~c?PBhOev#54Xi;rF
ze--twAoAfTLEA)VxM4m$qhCa5Y8UP(6b^T=ga?jpd;YLtC&oql<)@va)@bTwBQR4x@AIm$V@0=w?o2j*4`vkggzroUm1!)!C3pMrp3Q}NBX~hA#
zz?a3TyB=_OWw^FB5VDh-6g>=>Ljog0fBl1Xq1o?!op9Ez2J6@<&c*`>K
zbYHP@bV2m_6X)w+IR~v9Xhp@s8qU#o89(Ecyq{ESh`Sy!{eNW`216Ef)<_sA4pa;x
zB)w!AlhSpW?%hbL)ja=bGL2{5ONKECSm@sOa#s^=iqm8ox|CtYWc>Q9kRv;4&Ky!K
zO@~k8r>0|qO$y0KZgB%h;pHJ{S_amo45I-c(kh_j^2%ADN|iy~aR&(MdsD>l4K~es
zZ&7k0$M#k#eS+u^G+ZjVwhZ%F+jckf9~F>rp~E(zrs>E`C%FDH(SXn6*7A7A
zrjY%9-^ov41tVm4@ppuVL1hE8|7kIHZTmm9JY41>*UVPw51i9w)?e{hqCSNn{e%8^T*XD3^
zB~FOH3U=iRd)=E@V&&+F0N*P)`$H6z7`423^x5o+$l$a$ZaaB?($KG?4e&^vyOt+puk-Q=
zEKl-ZOt3pSzUr?dR{IcZFGtA|pgNa^KIf7|hu|l>Cr1mxR#DVfl=}Tf+F4RVy;u)X
zoB-Wi^*_ZQqY^i5chER!e7e%Xpvu<0K~_POPOzP`bKuNi|NFOygJ=xR`g*Fc)m?1_{%^DMg8^I$>4?Pjc*T`{oCc3k-8HA49Aj$`b#}f
z9Ecb){c(t=xWYfPKG}7&M|!(K9yrVlZK%Jx~M}2@=oY
zFJCAYywpsOI75nV5$}PTivawRcYbNe{pImHsZUdbmWa-7F3PM0Na&ct9;cY+7XT
zZyy8nA)u`QSKrPoMn{4GmIj8V0r(^U>1ZH(C8*p?rIiRD&YtYaA1_Ab9;KZ=-qX=O
zq|Ci)m2Kx|LF)R&Gy6wveLmSIN9`t_Z1F`|o)AIA+7-rN4yXV>BVu3W88jWe7&P+v
z>6x0^KJ8k0z+TFaoO6m5R8R%6B}|jkzmX-Voy3f6f3z#Os9oxILhI1kN4K?5B~Hl3
zi6X&M0I6?MwLR8)ZU*ZsgP7axWF3%13R%f{*JCXgX8B@kNf*D@S(Mupv?eC=O_$
zP*E4Z)}M0_TzR}Ed`OMjE*YOaaXsA@rAR2MAi%Vm8QX>T+ohmEvZtB=1TDr7uq6t?
z=J4?hIGlKL*fy_}7mNtky$MS3-eo9aFBUIB!vc
z)=QVL>5eVtAl*t}8U+_t|3>|uu&a($nu!W5RVp!@{5O;S6+oooHY}?Q5!HD+Pnx9&
zTCE_IoaNR0%t=VlWnwR?{KlEX&@?4R=$BB5m|EJjPaXJ%KeH}mozG8rt;_Vfv{3{W
z4CF>3TeQjhhkf`8?&auYV7NQQ{EJs^uzMNh)87SnQwAtd{H13XArnAQm;V~(
zH9*_M-_({GJN<2I>AEku8erx%z>XY?KSU64Be)hJ8UVF=S$9)$Z(r?lFGeeZX3=|6
zJuNRp+})WRm4We1VlTEs06h(8G@yonasub!V^EJ41j)zy{rTKHFd`1PtpJ#_>pC#A
zek#3no-R{TtO!$;dJHE)>N=-mIPgmD*o(TsKxd{vu@nHdYbeUj&X3rORaT&p3v@<-
z8LR+30hEvaR1;Ox;@Tm_$(K9_eXW~MuW`8t%5DNEfcrWqB)q^x1T8SZ3lSy-Auz!?
zFi`|}mlx1xK>r2pDA4{|gvpdMhV#!QFtIU^+#i7Us>2S_68ske|GEBBpxit_p-`*w-7J=I?Je6XcH-#%L&gO{7U
z3^EXr7raweF<+7(F`yjh^B<8>(fkQ(^2Vf>suF#&8K_JL>hG+85>&$Fs19T+b2S`p
zv2x}j7<1Tb{%`7&=~6BB^DrO^U0#14x#7R@xN2v9bHCsq^4AAvam3?%2P4U!#GD?H
z?TRngkWzt4sG3EpmbO+iO0mnQho8Q2J+myHypBT8hDU_{)Oe<{>c7#wz#KA*4~&H4
zckdwZy}8a%3@N@ZRAWSG%;Jvuspl~)$RixzoN*O>dcw0uzE`8PCgghTh9V+C3I1ju
zH=TEKGmumD(h$drF#`@3h|oFTdmUQ%Z`p`eO0|0-P|K0>k2=?y@h0XlqeRWNoxrF4
z%*YNyUYxIaz0d57+ED2Sw7(azq6stCpFmky3@yH}uX@Z$hz=y7D6YNL!c;y+>c9y#
zE?#27wY8LdaO|Zj<9f>~tme4p`7G_dQ;oiGWr^lE^l3h$URHq
zdd)T}F_&aTF7kF4K7y4Iv
zw}k?mcoj#iy;Bw71lKT(S7X?dM4v05wd8H3sRfVQ?O2&G_K%^~vHTZ`R;h;Mmqv#ee{
zik@hn-SOYSv&q}Ef6nml;7(Axvu$
zmWlC!*_U&ZF6^NMm36O7CuOW|cd33VeXX6GCm|aI-4`1eHi|&4Dxt=^)nB@xD{D@E
zP6V%BYlMbHyc&t3dUbAAI=l6>JX`Q)RUH4~
zeyr*i5_A;0LWkh(R$cr!Vh7vo$9fLxN88e9Q*W3{wwbcl>3#v=t)4WanFb6xItRyoayl&D
zP#g=49Dj1@aQj`X1p@DTmE&i>m+Dk$nOabSb;DH@#%%(9IN*Krk=BEfEsrPM9IJ_P
z9k&^3eU+k)wl-L88QS^EV(o5W-)TLd{#j7T>rTc6zObD`t7XW@ya18MJxC-}kgO+1
z^G%tro!to;U5Vr|O?l_Ape`=($H^;id2R
z^))PtPxKPPMCajge+GzAIWq=gJl@;HkMp#*DK!0As8p-c;DY;9P+daxiiF#Jm{PnC
zY@#JINLyDU)|lea@CQ096+ITH<@-J_aEzJwlC>AjToDHr0TYOCdYmS3o1w6bc@0$A
z(tD?mRJE*?Ez0rfLRItcR>`ujV39LzAIR8VvZ&QF9AB;R^P#b>q=O$O*C}Ar4}Xr|
zZTVCj;8bBQb$F|Eb`STUGu#Bf2!FcPnzH|J?qIV(uzbFjHV1e+E8Idebn|8O%nQ}W
zw5!r`=FZLiG}5C^G|a%>pSc;2Wf|<4-cN_c278wUoJkm5SRRGzwe}gDS#QX?$kk1g
z#w;Ni2i*nZvBi&PP17#%22}S^6yk1uOfc+LnxQa37)X0yzSm
z5;yI=DV)j_ktvm+YZd+J-SR}yfh5K~vR6(LTlNxycwstZP}p>jRcuA?DfW+XcJNSD
z!xB
zfxe-iK1gG4XUK))tH{ts*AExSNPk-eIry#of!QFUvPt3{`aWzi1$}@~ppJ^n^KzS@
zu;;%%%~+B3VQ`m{q*@TKd3+LW7PiY~(=(gYw8`un;wMco2W-xNvKeGRC{7_qH;Hk)
zusq)%eWvzmCS|R9@)yz{J@V9Mj+#_Q{n%C^)a^}|GL}}JiJy_}JFwdqa&Dd+O^15e
z8p()QuzBA{yF>|fA3Jw$MvPa5T`x>`s@PCG8Jqa(SDF2<@H-;C{_*<6{&XEqCHSKh
z38jYa&uvg2*blwCg6tIPf7K{>_Kecm06faqHl|ap$c~GDfddK8G|U^V99jRV%vSF$
zVpQuhdT%BZ9zhEyyyaKI?}%iEu?TrRSwC&LZu(lB@DaiFVzV0)v{N>gQW4NME}TM>
z{4v}pbG-jnF|bWyXyeu>clc;V>xb~#$}I+?(ZfWtPcqum{N=HD`mwc45AmF#=e0-t
zH0#dL4J^)a@>BVkh_-d(1ww)0A20pRd2|S|eAeaZ$9<07;ttAMqj#)NhhNJ}a+B0p
zpr1Fzx#QT&73D>=I}@MvWqOKP&$j--YNSCG%^y7*U~BO{tE;c&8aQdF@qjCTF*7f<
z4PJeVlW9#`4f~@U`>Si?QLpX~15KNMVtgFWNcx*N!Hi(UPjicmx1&Zz8=TD1?0yqm
zroPVo^~MvQ@5U&=BP^9~)dHsxNn{Q}L=KK-z4Bxt
zUhdv;>+I;C$n!-?ElKM|YWK-1Vb<(=6?XD_<>uw^Zxf%4@!W1r(rtM2N3iKJzQwxk
zQ+ct<)=Xm|A}Fqb`Mt3Beb?*@F+L$id~RsA{M9d;bxsavv=hU{621veDhPfq*-p*E
z=*UQeB2B|Lj5Q$G>f&~9$e_!vId4P9Lin(05&7}5Lq%lB#ac5>ZWwi%Bisv*8r^E9
zh_UakhjuifCbz^ojoy{O^Ajok#gL=AS4&ks8x_%Q|T&8GQ~F;l0e#*M7U=X0!jZj5q}h|!*xd5Cl_sdN@?Tl-c)P^*L@H?su|quPLN*6`L|K4g_{Jb
zBO=u&+3$K&{E>aPXZF&KuHF%Gy^o~>EfnW$t28wSqhupI$FP-_Pt7ck_N!aR#E?;!
zCz~pyo8y;)mB{NfZqZA8PrLR+@H}$4=c}^$rsSR18fERhlSB2M-OKT&;_a)qFLA|a!5C|LPZjuDPLov6
zxvcolCi;f4|9klnYXU;+kqR*h7nWR{s+i>&rQi6AtO!3V@USdE%lO*la=6B#-s=4=
z$24*nBKXSB_smEwTLR|?53QVFa?;8JMfHL*`G6o?-iXf+#bp_)#lBI7z)IuLj(RZf
zmBy@hxErI2=WsnpK1+*Z?1d?%ulzsk1bzoS^c+-anax?xX-?to{jw8uXv;`k|EDR2
z9_iv5)V{#a8&hGxhJIaX^DrnaUJYM0yXyR^H$Ji8D|+vdD&H%GYym^OzTRuP;JiFD
zpX~lAt;zuoG3-x&Zm$MTqzrcKR=|b<-q9TIz%tuetmx@7{cPd0b$XRe^<{g$1$g2=
zLxEQU(c^(fpq4F5N54J+33@K!+nFiVb$w*hN73Mz;HKP34gbo?POqLB?L1O^-Uzk8
zV)9g%Cdo(IwXCuj9V9i|Io*WP8mW$Qq4_M@Z`m
zPksV>Wu5JP9vFR7v74$e&LdJvzUGKt=z6xkCe=DPnHt$zQtV=$BXB#Z{JU14w*aYz
z``xW`SF^gp-uy=N?M^4{{;3_`7mS2u-B`4zTy&Dh=V?Cn(`%VByS&uPIq^AcxJqbf
zSbRaDxM9z(C(>TiL(N>jp@KZ^pu_JMm+Sox8{#4MVdd2Y|4~avf${-ic=G**gYT9^
znl*j7XPJ1_SLbh37}2F(qA*Osm{C
zG#!Ze9;ZJVc&vZ8eq%IF*XL8G+~7XC@5mSI8?koJcdx2OY}WXB*`)cTQE+yJ_KyQA
zKFm$K>b>?<>%O*k34c4Cz~NBGzcagQ>8S6u%j-W^@R~AV#yLF*hgn(h1gmeW_kpXS~?DQT79i=+nmze
zm83?SmNVbRu_J}V`IwPMcMy-;Y?%EcQL6zQlv)+o`XGEI-oC8d&?Vl)(e^%j1f
zT!F;>i2nM|#r_*6P7S+Qm;03(nE&2>uvY_0wwHfeqZ+f`&!4_^TZx^F^?6LNQvF`d
zzf~hwFD9Yao*Qz8jF{O&$yBM1*AsrCq{(b4q3Izqpi~XVDacI?FC7GYs7Y@y3VV~D
z{%A|7*r#@SG-OKahT#RSmXvB$iunbmhxbM)%7{Gg?xE$1BVSg+^s(w}XH4fG!&|!f
z=oBlCv`ZIYMk(#EgAEk6!TC;FYqEFsf;azY-6fd@I$QM=pU!hOg%O!D4{hb+Mcd`M
zNaA9cXHASD<7r_*Y+11qdUe&kUHMaW@}*
zXT1|VTHfxDKPEi4SxhHLF3lCJUJw1aDM}R-aLJ$Qg)=Zcz-qQ
zh7l|@#^iP~Lx$mB4zd=m7$<-?>8$ff@d{*9?
zm9o2^(iK=%*fFezT9z
zpR9QI`g?qrl5}BHReQNq}N7+^^9lDo5UvUvrX^wrlb{FczJw
zTg95>O_mC!7EGy7hlXhCui6Pa7s8KprG|{GD*n4FOt$zPO)^qJtmjcx7_VU7xpTOz
z*z=q#Fu8z1erpO`4Jz)pbew$NL{}zYgh#G5tQV2%%hnt$;0~&XY4VJe?_Pzg*-;~S
z_0uaH8>HUYc($Dz`RG#KH?PzB4$OTEaY6qT+|3{-7`IY1#`LSyeXTcTl5=F%K3o;=
zQ$BT~d1fta_4j>fU!rb08?%Yw^jg`SbvJOnbh(Y9!PQALi?mud9Y
ze^cRc=q$W)NC7*q2YQ-W1GA)VqdlX1bi?}Mi@rRQCJk64W+AT#e354)|Gf+FZeODd
z>n$5=V4hia7eTpn5rCFh;Y2?
zVXm(L=B8&Nmo_2XIz-;&?=}p^{@XzI#v=)8T4{XoLc|2}8tqz;k8wV_sZP&?9(fal^evD-|8<)7+swU#N3Oj?yp|{>OQfo)jS!r=h>XG&0PEn
z_JIGBEAvsr)Iu_43MTVoxRMpczd9x_zGH7Kr*-)V+u0PV-uaEI_k9Jv^rDK^mR8DL
z(={74&)WC5pQf)250!_-nylbR36EF{hxL!pyeG;zEAD=B4&_-P+{7`H`4x}ig2RW-
zxVK2zoRvZRpPVAAmAvlOTqD>q6~6#}iDg^z|3g{xnuDcx(m5Df=o+sUAr689ASs;X#tI
z_T#04k!N*2*wPTojQ(v)p~@*0lam)>DBi1a-^_2nxI2Uq0=DdM7zg8_eJ?c#w0PS8j+
zgy@ocHO2?*=iXP4Gh|{7FJ!g_htKMl+Lj0``E?u^cpqVQzE69bZ5{e`-kX`Cct6;^
z&30p!n1T6?9kF!uTks0d%lk_8>oO36B+UuF>n$vKdMyi{>I
z)kpEPA@R~krTWPjwwO{fQ$5dFVRjYB$)DdMT8EK@s#IUTkgq)U-rpz&TtzNnw120T
z*}s9$Q?#qI9Aja$=OpKINL|pFUJn0bPC6g<(J8EDp*POL@95!xekNw4_|qFxqXHG2F)i$>myr)uHUj|ePBI0x#nVdady3$+vy4Ak*yDh7)@F1uamHnGj0FlU)G<{`U
z2JwMiv{w7atKmTFq~gGtcursMuUDlFr-nxcjtI=i@O)6Keo1oZcTkhVq9vK3?ZhR5Z)MeP_TbpBICf(Lp3%vl4f*O9!CsD6@CyRBXklc>gRYGRWXC
zocQ%TL^6Yw_yX)Al(D8(30k&jx60i0?Iwn3DQ?m&>DX!Ck&sW=3Wu*)f5Xlct`!bM
zxjgXdeeFCGNaj^U#U)(_u+;O_e=Ld
z+jEDOG@Pq>zL_PyJPl`P=b5+9gg_xQ{d}35NoC<&I*Fd6MN7B;ekmAl&AS94SpA<1
lSxN8y*WJ>32yRG4Wwk-r?M(f`z`Z!g9ev}Q6*nAX{|CBzZ^r-t
literal 0
HcmV?d00001
diff --git a/docs/source/_static/img/ptv_logo.svg b/docs/source/_static/img/ptv_logo.svg
new file mode 100644
index 00000000..352c7dd4
--- /dev/null
+++ b/docs/source/_static/img/ptv_logo.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html
new file mode 100644
index 00000000..7ab36676
--- /dev/null
+++ b/docs/source/_templates/layout.html
@@ -0,0 +1,25 @@
+
+
+{%- extends "!layout.html" %}
+
+{% block sidebartitle %}
+
+
+
+
+
+{% if theme_display_version %}
+ {%- set nav_version = version %}
+ {% if READTHEDOCS and current_version %}
+ {%- set nav_version = current_version %}
+ {% endif %}
+ {% if nav_version %}
+
+ {{ nav_version }}
+
+ {% endif %}
+{% endif %}
+
+{% include "searchbox.html" %}
+
+{% endblock %}
\ No newline at end of file
diff --git a/docs/source/accelerator.md b/docs/source/accelerator.md
new file mode 100644
index 00000000..5f6f6442
--- /dev/null
+++ b/docs/source/accelerator.md
@@ -0,0 +1,53 @@
+
+# Overview
+
+Our vision for PytorchVideo/Accelerator is to enable video understanding models to run efficiently on all tiers of hardware devices, from mobile phones to GPUs. PytorchVideo/Accelerator (Accelerator) aims to speed up video understanding models running on various hardware devices, as well as the whole process of designing and deploying hardware-aware efficient video understanding models. Specifically, Accelerator provides a complete environment that allows users to:
+
+* Design efficient models for target hardware with carefully tuned efficient blocks;
+* Fine-tune efficient models from the Model Zoo;
+* Optimize model kernels and graph for the target device;
+* Deploy the efficient model to the target device.
+
+
+We benchmarked the latency of SOTA models ([X3D-XS and X3D-S](https://arxiv.org/abs/2004.04730)) on a mainstream mobile device (Samsung S9 International, released in 2018). With Accelerator, we not only observed a 4-6X latency reduction in fp32, but also enabled int8 operation, which is not supported in vanilla PyTorch. A table summarizing the latency comparison is shown below.
+
+|model |implementation |precision |latency per 1-s clip (ms) |speed up |
+|--- |--- |--- |--- |--- |
+|X3D-XS |Vanilla PyTorch |fp32 |1067 |1.0X |
+|X3D-XS |PytorchVideo/Accelerator |fp32 |233 |4.6X |
+|X3D-XS |PytorchVideo/Accelerator |int8 |165 |6.5X |
+|X3D-S |Vanilla PyTorch |fp32 |4248 |1.0X |
+|X3D-S |PytorchVideo/Accelerator |fp32 |763 |5.6X |
+|X3D-S |PytorchVideo/Accelerator |int8 |503 |8.4X |
+
+## Components in PytorchVideo/Accelerator
+
+### Efficient block library
+
+The efficient block library contains common building blocks (residual block, squeeze-excite, etc.) that can be mapped to the high-performance kernel operator implementations of the target device platform. The rationale behind having an efficient block library is that a high-performance kernel operator library generally supports only a small set of kernel operators; in other words, a randomly picked kernel might not be supported by it. Building a model with the efficient blocks in this library guarantees that the model is deployable with high efficiency on the target device.
+
+The efficient block library lives under `pytorchvideo/layers/accelerator/` (for simple layers) and `pytorchvideo/models/accelerator/` (for complex modules such as residual blocks). Please also check the [Build your model with PytorchVideo/Accelerator](link_fixme) tutorial for detailed examples.
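+
+For a rough sense of the workflow, the sketch below composes two efficient blocks into a tiny residual module. The block names (`Conv3dPwBnAct`, `Conv3d3x3x3DwBnAct`) and their argument lists are assumptions made for illustration; the linked tutorial has the authoritative usage.
+
+```python
+import torch
+import torch.nn as nn
+
+# Assumed import path and block names for mobile-CPU efficient blocks.
+from pytorchvideo.layers.accelerator.mobile_cpu.convolutions import (
+    Conv3d3x3x3DwBnAct,  # depthwise 3x3x3 conv + BN + activation
+    Conv3dPwBnAct,  # pointwise (1x1x1) conv + BN + activation
+)
+
+
+class TinyEfficientBlock(nn.Module):
+    """A minimal inverted-residual-style module built only from efficient blocks."""
+
+    def __init__(self, dim_in: int, dim_hidden: int):
+        super().__init__()
+        self.expand = Conv3dPwBnAct(dim_in, dim_hidden)
+        self.depthwise = Conv3d3x3x3DwBnAct(dim_hidden)
+        self.project = Conv3dPwBnAct(dim_hidden, dim_in, activation="identity")
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x + self.project(self.depthwise(self.expand(x)))
+
+
+block = TinyEfficientBlock(dim_in=16, dim_hidden=64)
+out = block(torch.randn(1, 16, 4, 56, 56))  # (N, C, T, H, W)
+```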
+
+### Deployment
+
+The deployment flow includes kernel optimization as well as model export for the target backend. Kernel optimization can be a critical factor in the performance of on-device model execution. Accelerator provides a set of useful deployment utilities under `pytorchvideo/accelerator/deployment`. Please also check the related tutorials ([Build your model with PytorchVideo/Accelerator](link_fixme), [Accelerate your model with model transmuter in PytorchVideo/Accelerator](link_fixme)) for detailed examples.
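+
+For orientation only, a deployment pass might look like the sketch below; the helper names (`transmute_model`, `convert_to_deployable_form`) and their signatures are assumptions here, so defer to the tutorials for the supported calls.
+
+```python
+import torch
+import torch.nn as nn
+
+# Assumed deployment helpers; names and signatures may differ from the actual library.
+from pytorchvideo.accelerator.deployment.common.model_transmuter import transmute_model
+from pytorchvideo.accelerator.deployment.mobile_cpu.utils.model_conversion import (
+    convert_to_deployable_form,
+)
+
+# Stand-in for a real video model targeted at mobile CPU.
+model = nn.Sequential(nn.Conv3d(3, 8, kernel_size=1), nn.ReLU())
+example_input = torch.randn(1, 3, 4, 160, 160)  # (N, C, T, H, W) dummy clip
+
+# 1. Swap generic layers for device-efficient equivalents (graph-level optimization).
+transmute_model(model, target_device="mobile_cpu")
+
+# 2. Convert efficient blocks into their deploy-friendly form for the target backend.
+deploy_model = convert_to_deployable_form(model, example_input)
+
+# 3. Export with TorchScript for on-device execution.
+torch.jit.script(deploy_model).save("efficient_model.pt")
+```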
+
+### Model zoo
+
+Accelerator provides an efficient model zoo for target devices, which includes model builders (under `pytorchvideo/models/accelerator/`) as well as pretrained checkpoints. Please also refer to [Use PytorchVideo/Accelerator Model Zoo](link_fixme) for how to use the model zoo.
+
+
+## Supported devices
+
+Currently, mobile CPU (ARM-based CPUs on mobile phones) is supported. We will update this page once more target devices are supported.
+
+
+## Jumpstart
+
+Refer to the following tutorial pages to get started!
+
+[Build your model with PytorchVideo/Accelerator](link_fixme)
+
+[Use PytorchVideo/Accelerator Model Zoo](link_fixme)
+
+[Accelerate your model with model transmuter in PytorchVideo/Accelerator](link_fixme)
diff --git a/docs/source/api/data/charades.rst b/docs/source/api/data/charades.rst
new file mode 100644
index 00000000..285c3bd7
--- /dev/null
+++ b/docs/source/api/data/charades.rst
@@ -0,0 +1,10 @@
+pytorchvideo.data.charades
+===========================
+
+Dataset loaders and supporting classes for the Charades dataset stored as frames
+
+.. automodule:: pytorchvideo.data.charades
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
diff --git a/docs/source/api/data/domsev.rst b/docs/source/api/data/domsev.rst
new file mode 100644
index 00000000..88f6ec34
--- /dev/null
+++ b/docs/source/api/data/domsev.rst
@@ -0,0 +1,9 @@
+pytorchvideo.data.domsev
+===========================
+
+
+.. automodule:: pytorchvideo.data.domsev
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
diff --git a/docs/source/api/data/encoded_video.rst b/docs/source/api/data/encoded_video.rst
new file mode 100644
index 00000000..1057b581
--- /dev/null
+++ b/docs/source/api/data/encoded_video.rst
@@ -0,0 +1,59 @@
+pytorchvideo.data.encoded_video_dataset
+===========================
+
+Dataset loaders and supporting classes for encoded video datasets (e.g. Kinetics, HMDB51, UCF101, etc.)
+
+.. automodule:: pytorchvideo.data.encoded_video_dataset
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+pytorchvideo.data.encoded_video_pyav
+===========================
+
+
+.. automodule:: pytorchvideo.data.encoded_video_pyav
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+pytorchvideo.data.encoded_video_torchvision
+===========================
+
+
+.. automodule:: pytorchvideo.data.encoded_video_torchvision
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+pytorchvideo.data.encoded_video
+===========================
+
+
+.. automodule:: pytorchvideo.data.encoded_video
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+pytorchvideo.data.ucf101
+===========================
+
+
+.. automodule:: pytorchvideo.data.ucf101
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+pytorchvideo.data.kinetics
+===========================
+
+
+.. automodule:: pytorchvideo.data.kinetics
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/api/data/epic_kitchen.rst b/docs/source/api/data/epic_kitchen.rst
new file mode 100644
index 00000000..5fe93103
--- /dev/null
+++ b/docs/source/api/data/epic_kitchen.rst
@@ -0,0 +1,18 @@
+pytorchvideo.data.epic_kitchen_forecasting
+=================================
+
+
+.. automodule:: pytorchvideo.data.epic_kitchen_forecasting
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+pytorchvideo.data.epic_kitchen_recognition
+=================================
+
+
+.. automodule:: pytorchvideo.data.epic_kitchen_recognition
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/source/api/data/extra.rst b/docs/source/api/data/extra.rst
new file mode 100644
index 00000000..71ccc653
--- /dev/null
+++ b/docs/source/api/data/extra.rst
@@ -0,0 +1,48 @@
+pytorchvideo.data.labeled_video_paths
+===========================
+
+
+.. automodule:: pytorchvideo.data.labeled_video_paths
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+pytorchvideo.data.frame_video
+===========================
+
+
+.. automodule:: pytorchvideo.data.frame_video
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+pytorchvideo.data.clip_sampling
+===========================
+
+
+.. automodule:: pytorchvideo.data.clip_sampling
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+pytorchvideo.data.video
+===========================
+
+
+.. automodule:: pytorchvideo.data.video
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+pytorchvideo.data.utils
+===========================
+
+
+.. automodule:: pytorchvideo.data.utils
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/api/data/hmdb51.rst b/docs/source/api/data/hmdb51.rst
new file mode 100644
index 00000000..b63e6c6d
--- /dev/null
+++ b/docs/source/api/data/hmdb51.rst
@@ -0,0 +1,9 @@
+pytorchvideo.data.hmdb51
+===========================
+
+
+.. automodule:: pytorchvideo.data.hmdb51
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
diff --git a/docs/source/api/data/index.rst b/docs/source/api/data/index.rst
new file mode 100644
index 00000000..960f3c56
--- /dev/null
+++ b/docs/source/api/data/index.rst
@@ -0,0 +1,11 @@
+Data
+==================
+
+.. toctree::
+
+ encoded_video
+ charades
+ epic_kitchen
+ domsev
+ hmdb51
+ extra
\ No newline at end of file
diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst
new file mode 100644
index 00000000..ef7efd8f
--- /dev/null
+++ b/docs/source/api/index.rst
@@ -0,0 +1,9 @@
+API Documentation
+==================
+
+.. toctree::
+
+ models/index
+ data/index
+ layers/index
+ transforms/index
\ No newline at end of file
diff --git a/docs/source/api/layers/index.rst b/docs/source/api/layers/index.rst
new file mode 100644
index 00000000..2e72243c
--- /dev/null
+++ b/docs/source/api/layers/index.rst
@@ -0,0 +1,6 @@
+Layers
+==================
+
+.. toctree::
+
+ layers
\ No newline at end of file
diff --git a/docs/source/api/layers/layers.rst b/docs/source/api/layers/layers.rst
new file mode 100644
index 00000000..82ac603e
--- /dev/null
+++ b/docs/source/api/layers/layers.rst
@@ -0,0 +1,72 @@
+pytorchvideo.layers.batch_norm
+=================================
+
+
+.. automodule:: pytorchvideo.layers.batch_norm
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+pytorchvideo.layers.convolutions
+=================================
+
+
+.. automodule:: pytorchvideo.layers.convolutions
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+pytorchvideo.layers.fusion
+=================================
+
+
+.. automodule:: pytorchvideo.layers.fusion
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+pytorchvideo.layers.mlp
+=================================
+
+
+.. automodule:: pytorchvideo.layers.mlp
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+pytorchvideo.layers.nonlocal_net
+=================================
+
+
+.. automodule:: pytorchvideo.layers.nonlocal_net
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+pytorchvideo.layers.positional_encoding
+=================================
+
+
+.. automodule:: pytorchvideo.layers.positional_encoding
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+pytorchvideo.layers.swish
+=================================
+
+
+.. automodule:: pytorchvideo.layers.swish
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+pytorchvideo.layers.squeeze_excitation
+=================================
+
+
+.. automodule:: pytorchvideo.layers.squeeze_excitation
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/api/models/byol.rst b/docs/source/api/models/byol.rst
new file mode 100644
index 00000000..48aefbfc
--- /dev/null
+++ b/docs/source/api/models/byol.rst
@@ -0,0 +1,8 @@
+pytorchvideo.models.byol
+=================================
+
+
+.. automodule:: pytorchvideo.models.byol
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/api/models/csn.rst b/docs/source/api/models/csn.rst
new file mode 100644
index 00000000..cdc2beb9
--- /dev/null
+++ b/docs/source/api/models/csn.rst
@@ -0,0 +1,8 @@
+pytorchvideo.models.csn
+=================================
+
+
+.. automodule:: pytorchvideo.models.csn
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/api/models/head.rst b/docs/source/api/models/head.rst
new file mode 100644
index 00000000..91538143
--- /dev/null
+++ b/docs/source/api/models/head.rst
@@ -0,0 +1,8 @@
+pytorchvideo.models.head
+=================================
+
+
+.. automodule:: pytorchvideo.models.head
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/api/models/index.rst b/docs/source/api/models/index.rst
new file mode 100644
index 00000000..f63d30a4
--- /dev/null
+++ b/docs/source/api/models/index.rst
@@ -0,0 +1,17 @@
+Models
+==================
+
+.. toctree::
+
+ resnet
+ net
+ head
+ stem
+ csn
+ x3d
+ slowfast
+ r2plus1d
+ simclr
+ byol
+ memory_bank
+ masked_multistream
\ No newline at end of file
diff --git a/docs/source/api/models/masked_multistream.rst b/docs/source/api/models/masked_multistream.rst
new file mode 100644
index 00000000..afb01120
--- /dev/null
+++ b/docs/source/api/models/masked_multistream.rst
@@ -0,0 +1,8 @@
+pytorchvideo.models.masked_multistream
+=================================
+
+
+.. automodule:: pytorchvideo.models.masked_multistream
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/api/models/memory_bank.rst b/docs/source/api/models/memory_bank.rst
new file mode 100644
index 00000000..79492ee2
--- /dev/null
+++ b/docs/source/api/models/memory_bank.rst
@@ -0,0 +1,8 @@
+pytorchvideo.models.memory_bank
+=================================
+
+
+.. automodule:: pytorchvideo.models.memory_bank
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/api/models/net.rst b/docs/source/api/models/net.rst
new file mode 100644
index 00000000..a12e1aae
--- /dev/null
+++ b/docs/source/api/models/net.rst
@@ -0,0 +1,8 @@
+pytorchvideo.models.net
+=================================
+
+
+.. automodule:: pytorchvideo.models.net
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/api/models/r2plus1d.rst b/docs/source/api/models/r2plus1d.rst
new file mode 100644
index 00000000..3ab38d5c
--- /dev/null
+++ b/docs/source/api/models/r2plus1d.rst
@@ -0,0 +1,8 @@
+pytorchvideo.models.r2plus1d
+=================================
+
+
+.. automodule:: pytorchvideo.models.r2plus1d
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/api/models/resnet.rst b/docs/source/api/models/resnet.rst
new file mode 100644
index 00000000..0f3679e4
--- /dev/null
+++ b/docs/source/api/models/resnet.rst
@@ -0,0 +1,9 @@
+pytorchvideo.models.resnet
+=================================
+
+Building blocks for ResNet and ResNet-like models
+
+.. automodule:: pytorchvideo.models.resnet
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/api/models/simclr.rst b/docs/source/api/models/simclr.rst
new file mode 100644
index 00000000..aabb83be
--- /dev/null
+++ b/docs/source/api/models/simclr.rst
@@ -0,0 +1,8 @@
+pytorchvideo.models.simclr
+=================================
+
+
+.. automodule:: pytorchvideo.models.simclr
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/api/models/slowfast.rst b/docs/source/api/models/slowfast.rst
new file mode 100644
index 00000000..6d521b45
--- /dev/null
+++ b/docs/source/api/models/slowfast.rst
@@ -0,0 +1,8 @@
+pytorchvideo.models.slowfast
+=================================
+
+
+.. automodule:: pytorchvideo.models.slowfast
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/api/models/stem.rst b/docs/source/api/models/stem.rst
new file mode 100644
index 00000000..dbf8046c
--- /dev/null
+++ b/docs/source/api/models/stem.rst
@@ -0,0 +1,8 @@
+pytorchvideo.models.stem
+=================================
+
+
+.. automodule:: pytorchvideo.models.stem
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/api/models/x3d.rst b/docs/source/api/models/x3d.rst
new file mode 100644
index 00000000..0d7b953b
--- /dev/null
+++ b/docs/source/api/models/x3d.rst
@@ -0,0 +1,8 @@
+pytorchvideo.models.x3d
+=================================
+
+
+.. automodule:: pytorchvideo.models.x3d
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/api/transforms/index.rst b/docs/source/api/transforms/index.rst
new file mode 100644
index 00000000..313f68e2
--- /dev/null
+++ b/docs/source/api/transforms/index.rst
@@ -0,0 +1,6 @@
+Transforms
+==================
+
+.. toctree::
+
+ transforms
\ No newline at end of file
diff --git a/docs/source/api/transforms/transforms.rst b/docs/source/api/transforms/transforms.rst
new file mode 100644
index 00000000..87f468c9
--- /dev/null
+++ b/docs/source/api/transforms/transforms.rst
@@ -0,0 +1,18 @@
+pytorchvideo.transforms.transforms
+=================================
+
+
+.. automodule:: pytorchvideo.transforms.transforms
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+pytorchvideo.transforms.functional
+=================================
+
+
+.. automodule:: pytorchvideo.transforms.functional
+ :members:
+ :undoc-members:
+ :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 00000000..d8f0b3fd
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,197 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# flake8: noqa
+import os
+import sys
+
+import mock
+
+# -- Project information -----------------------------------------------------
+import sphinx_rtd_theme
+from recommonmark.parser import CommonMarkParser
+
+
+# -- Path setup --------------------------------------------------------------
+sys.path.insert(0, os.path.abspath("../"))
+sys.path.insert(0, os.path.abspath("../pytorchvideo"))
+sys.path.insert(0, os.path.abspath("../../"))
+
+
+# The full version, including alpha/beta/rc tags
+try:
+ import torch # noqa
+except ImportError:
+ for m in [
+ "torch",
+ "torchvision",
+ "torch.nn",
+ "torch.autograd",
+ "torch.autograd.function",
+ "torch.nn.modules",
+ "torch.nn.modules.utils",
+ "torch.utils",
+ "torch.utils.data",
+ "torchvision",
+ "torchvision.ops",
+ "torchvision.datasets",
+ "torchvision.datasets.folder",
+ "torch.utils.data.IterableDataset",
+ ]:
+ sys.modules[m] = mock.Mock(name=m)
+
+
+project = "PyTorchVideo"
+copyright = "2021, PyTorchVideo contributors"
+author = "PyTorchVideo contributors"
+
+
+# -- General configuration ---------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+needs_sphinx = "3.0"
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ "recommonmark",
+ "sphinx.ext.autodoc",
+ "sphinx.ext.napoleon",
+ "sphinx.ext.intersphinx",
+ "sphinx.ext.todo",
+ "sphinx.ext.coverage",
+ "sphinx.ext.mathjax",
+ "sphinx.ext.viewcode",
+ "sphinx.ext.githubpages",
+ "sphinx.ext.doctest",
+ "sphinx.ext.ifconfig",
+ "sphinx_markdown_tables",
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# -- Configurations for plugins ------------
+napoleon_google_docstring = True
+napoleon_include_init_with_doc = True
+napoleon_include_special_with_doc = True
+napoleon_numpy_docstring = False
+napoleon_use_rtype = False
+autodoc_inherit_docstrings = False
+autodoc_member_order = "bysource"
+
+intersphinx_mapping = {
+ "python": ("https://docs.python.org/3.6", None),
+ "numpy": ("https://docs.scipy.org/doc/numpy/", None),
+ "torch": ("https://pytorch.org/docs/master/", None),
+}
+# -------------------------
+
+source_parsers = {".md": CommonMarkParser}
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+source_suffix = [".rst", ".md"]
+
+# The master toctree document.
+master_doc = "index"
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "build", "README.md"]
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = "sphinx"
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = True
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = "sphinx_rtd_theme"
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+html_theme_options = {
+ "collapse_navigation": False, # default
+ "display_version": True, # default
+ "logo_only": True, # default = False
+}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
+
+html_logo = "_static/img/ptv_logo.png"
+html_favicon = "../../website/website/static/img/favicon.png"
+
+
+# setting custom stylesheets https://stackoverflow.com/a/34420612
+html_context = {"css_files": ["_static/css/pytorchvideo_theme.css"]}
+
+# -- Options for HTMLHelp output ------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = "pytorchvideodoc"
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [(master_doc, "pytorchvideo", "PyTorchVideo Documentation", [author], 1)]
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ (
+ master_doc,
+ "PyTorchVideo",
+ "PyTorchVideo Documentation",
+ author,
+ "PyTorchVideo",
+ "One line description of project.",
+ "Miscellaneous",
+ )
+]
+
+
+def setup(app):
+ from recommonmark.transform import AutoStructify
+
+ app.add_config_value(
+ "recommonmark_config",
+ {
+ "auto_toc_tree_section": "Contents",
+ "enable_math": True,
+ "enable_inline_math": True,
+ "enable_eval_rst": True,
+ "enable_auto_toc_tree": True,
+ },
+ True,
+ )
+ return app
diff --git a/docs/source/data.md b/docs/source/data.md
new file mode 100644
index 00000000..909c0354
--- /dev/null
+++ b/docs/source/data.md
@@ -0,0 +1,48 @@
+# Overview
+
+PyTorchVideo datasets are subclasses of either torch.utils.data.Dataset or torch.utils.data.IterableDataset. As such, they can all be used with a torch.utils.data.DataLoader, which can load multiple samples in parallel using torch.multiprocessing workers. For example:
+
+```python
+dataset = pytorchvideo.data.Kinetics(
+ data_path="path/to/kinetics_root/train.csv",
+ clip_sampler=pytorchvideo.data.make_clip_sampler("random", duration=2),
+)
+data_loader = torch.utils.data.DataLoader(dataset, batch_size=8)
+```
+
+## How do PyTorchVideo datasets work?
+
+Although there isn't a strict interface governing how PyTorchVideo datasets work, they all share a common design as follows:
+
+1. Each dataset starts by taking a list of video paths and labels in some form. For example, Kinetics can take a file with each row containing a video path and label, or a directory with a `<label>/<video_name>.mp4`-like file structure. Each respective dataset documents the exact structure it expects for the given data path.
+
+2. At each iteration a video sampler is used to determine which video-label pair is going to be sampled from the list of videos from the previous point. For some datasets this is required to be a random sampler; others reuse the torch.utils.data.Sampler interface for more flexibility.
+
+3. A clip sampler is then used to determine which frames to sample from the selected video. For example, your application may want to sample 2-second clips at random from the selected video at each iteration. Some datasets like Kinetics make use of the pytorchvideo.data.clip_sampling interface to provide flexibility in how these clips are defined. Other datasets simply require you to specify an enum for common clip sampling configurations.
+
+4. Depending on whether the underlying videos are stored as encoded videos (e.g. mp4) or frame videos (i.e. a folder of images containing each decoded frame), the video clip is then selectively read or decoded into the canonical video tensor with shape (C, T, H, W) and audio tensor with shape (S). We provide two options for decoding: PyAV or TorchVision, which can be chosen in the interface of the datasets that support encoded videos.
+
+5. The next step of a PyTorchVideo dataset is creating a clip dictionary containing the video modalities, label and metadata ready to be returned. An example clip dictionary might look like this:
+ ```
+ {
+        'video': <video_tensor>,      # Shape: (C, T, H, W)
+        'audio': <audio_tensor>,      # Shape: (S)
+        'label': <action_label>,      # Integer defining class annotation
+        'video_name': <video_path>,   # Video file path stem
+        'video_index': <video_index>, # index of video used by sampler
+        'clip_index': <clip_index>    # index of clip sampled within video
+ }
+ ```
+ All datasets share the same canonical modality tensor shapes and dtypes, which aligns with tensor types of other domain specific libraries (e.g. TorchVision, TorchAudio).
+
+6. The final step before returning a clip is feeding it into a transform callable that can be defined for all PyTorchVideo datasets. This callable is used to allow custom data processing or augmentations to be applied before batch collation in the torch.utils.data.DataLoader. PyTorchVideo provides common [transforms](http://pytorchvideo.org/docs/api/transforms/transforms.html) that are useful for this callable, but users can easily define their own too, as in the sketch below.
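+
+As a minimal sketch of such a callable, the function below uniformly subsamples frames from the clip dictionary before it is returned. The `transform` keyword argument used here is an assumption; check each dataset's signature for the exact parameter name.
+
+```python
+import pytorchvideo.data
+import torch
+
+
+def sample_frames(clip: dict, num_frames: int = 8) -> dict:
+    # Uniformly pick `num_frames` indices along the temporal (T) dimension of the
+    # canonical (C, T, H, W) video tensor; all other keys are passed through untouched.
+    video = clip["video"]
+    indices = torch.linspace(0, video.shape[1] - 1, num_frames).long()
+    clip["video"] = video.index_select(dim=1, index=indices)
+    return clip
+
+
+dataset = pytorchvideo.data.Kinetics(
+    data_path="path/to/kinetics_root/train.csv",
+    clip_sampler=pytorchvideo.data.make_clip_sampler("random", duration=2),
+    transform=sample_frames,
+)
+```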
+
+## Available datasets:
+
+* [Charades](http://pytorchvideo.org/docs/api/data/charades.html#pytorchvideo-data-charades)
+* [Domsev](http://pytorchvideo.org/docs/api/data/domsev.html#module-pytorchvideo.data.domsev)
+* [EpicKitchen](http://pytorchvideo.org/docs/api/data/encoded_video.html#pytorchvideo-data-encoded-video-dataset)
+* [HMDB51](http://pytorchvideo.org/docs/api/data/encoded_video.html#pytorchvideo-data-encoded-video-dataset)
+* [Kinetics](http://pytorchvideo.org/docs/api/data/encoded_video.html#pytorchvideo-data-encoded-video-dataset)
+* SSV2 (TODO)
+* [UCF101](http://pytorchvideo.org/docs/api/data/encoded_video.html#pytorchvideo-data-encoded-video-dataset)
diff --git a/docs/source/data_preparation.md b/docs/source/data_preparation.md
new file mode 100644
index 00000000..aced4c2d
--- /dev/null
+++ b/docs/source/data_preparation.md
@@ -0,0 +1,45 @@
+## Data Preparation
+
+### Kinetics
+
+For more information about the Kinetics dataset, please refer to the official [website](https://deepmind.com/research/open-source/kinetics). You can take the following steps to prepare the dataset:
+
+1. Download the videos via the official [scripts](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics).
+
+2. Preprocess the downloaded videos by resizing the short edge to 256 pixels.
+
+3. Prepare the csv files for the training, validation, and testing sets as `train.csv`, `val.csv`, `test.csv` (see the sketch after the example below for one way to generate them). The format of each csv file is:
+
+```
+path_to_video_1 label_1
+path_to_video_2 label_2
+path_to_video_3 label_3
+...
+path_to_video_N label_N
+```
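+
+If your videos happen to be organized in a `<root>/<label>/<video>.mp4` layout (an assumption for this sketch; adapt the globbing and the label-to-id mapping to your own setup), a small helper can generate these space-separated csv files:
+
+```python
+from pathlib import Path
+
+
+def write_video_csv(video_root: str, out_csv: str) -> None:
+    """Write `<path_to_video> <label_id>` rows for a <root>/<label>/<video>.mp4 layout."""
+    root = Path(video_root)
+    # Map each label directory name to an integer id, sorted for determinism.
+    labels = sorted(p.name for p in root.iterdir() if p.is_dir())
+    label_to_id = {name: i for i, name in enumerate(labels)}
+    with open(out_csv, "w") as f:
+        for label in labels:
+            for video in sorted((root / label).glob("*.mp4")):
+                f.write(f"{video} {label_to_id[label]}\n")
+
+
+write_video_csv("path/to/kinetics_root/train", "train.csv")
+```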
+
+All the Kinetics models in the Model Zoo are trained and tested with the same data as [Non-local Network](https://github.com/facebookresearch/video-nonlocal-net/blob/master/DATASET.md) and [PySlowFast](https://github.com/facebookresearch/SlowFast/blob/master/slowfast/datasets/DATASET.md). For dataset-specific issues, please reach out to the [dataset provider](https://deepmind.com/research/open-source/kinetics).
+
+
+### Charades
+
+We follow [PySlowFast](https://github.com/facebookresearch/SlowFast/blob/master/slowfast/datasets/DATASET.md) to prepare the Charades dataset as follows:
+
+1. Download the Charades RGB frames from [official website](http://ai2-website.s3.amazonaws.com/data/Charades_v1_rgb.tar).
+
+2. Download the *frame list* from the following links: ([train](https://dl.fbaipublicfiles.com/pyslowfast/dataset/charades/frame_lists/train.csv), [val](https://dl.fbaipublicfiles.com/pyslowfast/dataset/charades/frame_lists/val.csv)).
+
+
+### Something-Something V2
+
+We follow [PySlowFast](https://github.com/facebookresearch/SlowFast/blob/master/slowfast/datasets/DATASET.md) to prepare the Something-Something V2 dataset as follows:
+
+1. Download the dataset and annotations from [official website](https://20bn.com/datasets/something-something).
+
+2. Download the *frame list* from the following links: ([train](https://dl.fbaipublicfiles.com/pyslowfast/dataset/ssv2/frame_lists/train.csv), [val](https://dl.fbaipublicfiles.com/pyslowfast/dataset/ssv2/frame_lists/val.csv)).
+
+3. Extract the frames from the downloaded videos at 30 FPS. We used ffmpeg-4.1.3 with the command:
+ ```
+ ffmpeg -i "${video}" -r 30 -q:v 1 "${out_name}"
+ ```
+4. The extracted frames should be organized to be consistent with the paths in the frame lists.
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 00000000..ba6dcdda
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,47 @@
+.. pytorchvideo documentation master file, created by
+ sphinx-quickstart on Tue Feb 23 17:19:36 2021.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+:github_url: https://github.com/facebookresearch/pytorchvideo/
+
+
+Welcome to PyTorchVideo's Documentation
+========================================
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Models
+
+ models
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Data
+
+ data
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Transforms
+
+ transforms
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Accelerator
+
+ accelerator
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Model Zoo
+
+ model_zoo
+ data_preparation
+
+.. toctree::
+ :maxdepth: 2
+ :caption: API
+
+ api/index
diff --git a/docs/source/model_zoo.md b/docs/source/model_zoo.md
new file mode 100644
index 00000000..073218cb
--- /dev/null
+++ b/docs/source/model_zoo.md
@@ -0,0 +1,63 @@
+
+
+
+## PyTorchVideo Model Zoo and Benchmarks
+
+PyTorchVideo provides reference implementations of a large number of video understanding approaches. In this document, we also provide comprehensive benchmarks to evaluate the supported models on different datasets using a standard evaluation setup. All the models can be downloaded from the provided links.
+
+### Kinetics-400
+
+arch | depth | pretrain | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) | Model
+-------- | ----- | -------- | -------------------------- | ----- | ----- | --------- | ---------- | --------------------------------------------------------------------------------------------------
+C2D | R50 | \- | 8x8 | 71.46 | 89.68 | 25.89 | 24.33 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/C2D\_8x8\_R50.pyth)
+I3D | R50 | \- | 8x8 | 73.27 | 90.70 | 37.53 | 28.04 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/I3D\_8x8\_R50.pyth)
+Slow | R50 | \- | 4x16 | 72.40 | 90.18 | 27.55 | 32.45 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/SLOW\_4x16\_R50.pyth)
+Slow | R50 | \- | 8x8 | 74.58 | 91.63 | 54.52 | 32.45 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/SLOW\_8x8\_R50.pyth)
+SlowFast | R50 | \- | 4x16 | 75.34 | 91.89 | 36.69 | 34.48 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/SLOWFAST\_4x16\_R50.pyth)
+SlowFast | R50 | \- | 8x8 | 76.94 | 92.69 | 65.71 | 34.57 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/SLOWFAST\_8x8\_R50.pyth)
+SlowFast | R101 | \- | 8x8 | 77.90 | 93.27 | 127.20 | 62.83 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/SLOWFAST\_8x8\_R101.pyth)
+CSN | R101 | \- | 32x2 | 77.00 | 92.90 | 75.62 | 22.21 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/CSN\_32x2\_R101.pyth)
+R(2+1)D | R50 | \- | 16x4 | 76.01 | 92.23 | 76.45 | 28.11 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/R2PLUS1D\_16x4\_R50.pyth)
+X3D | XS | \- | 4x12 | 69.12 | 88.63 | 0.91 | 3.79 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/X3D\_XS.pyth)
+X3D | S | \- | 13x6 | 73.33 | 91.27 | 2.96 | 3.79 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/X3D\_S.pyth)
+X3D | M | \- | 16x5 | 75.94 | 92.72 | 6.72 | 3.79 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/X3D\_M.pyth)
+
+### Something-Something V2
+
+| arch | depth | pretrain | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) | Model |
+| -------- | ----- | ------------ | -------------------------- | ----- | ----- | --------- | ---------- | ----- |
+| Slow | R50 | Kinetics 400 | 8x8 | 60.04 | 85.19 | 55.10 | 31.96 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/ssv2/SLOW\_8x8\_R50.pyth) |
+| SlowFast | R50 | Kinetics 400 | 8x8 | 61.68 | 86.92 | 66.60 | 34.04 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/ssv2/SLOWFAST\_8x8\_R50.pyth) |
+
+
+### Charades
+
+| arch     | depth | pretrain     | frame length x sample rate | mAP   | Flops (G) | Params (M) | Model |
+| -------- | ----- | ------------ | ---------------- | ----- | --------- | ---------- | ----- |
+| Slow | R50 | Kinetics 400 | 8x8 | 34.72 | 55.10 | 31.96 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/charades/SLOW\_8x8\_R50.pyth) |
+| SlowFast | R50 | Kinetics 400 | 8x8 | 37.24 | 66.60 | 34.00 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/charades/SLOWFAST\_8x8\_R50.pyth) |
+
+
+### Using the PyTorchVideo model zoo
+We provide several ways to use the PyTorchVideo model zoo.
+* The models are integrated into TorchHub, so they can be loaded with or without pre-trained weights (see the sketch after this list). Additionally, we provide a [tutorial](https://pytorchvideo.org/docs/tutorial_torchhub_inference) that walks through loading models from TorchHub and running inference.
+* PyTorchVideo models and datasets are also supported in PySlowFast. You can use the [PySlowFast workflow](https://github.com/facebookresearch/SlowFast/tree/master/projects/pytorchvideo) to train or test PyTorchVideo models and datasets.
+* You can also use [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning) to build training/testing pipelines for PyTorchVideo models and datasets. Please check this [tutorial](https://pytorchvideo.org/docs/tutorial_classification) for more information.
+
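+As a minimal sketch (assuming the library is published on TorchHub as `facebookresearch/pytorchvideo`; `slow_r50` is one of the entry points listed in this repo's `hubconf.py`):
+
+```python
+import torch
+
+# Load the Slow R50 model with pretrained Kinetics-400 weights from TorchHub.
+model = torch.hub.load(
+    "facebookresearch/pytorchvideo", "slow_r50", pretrained=True
+)
+model = model.eval()
+```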
+
+Notes:
+* The above benchmarks were run with the [PySlowFast workflow](https://github.com/facebookresearch/SlowFast/tree/master/projects/pytorchvideo) using PyTorchVideo datasets and models.
+* For more details on data preparation, refer to [PyTorchVideo Data Preparation](data_preparation.md).
+
+
+
+### PyTorchVideo Accelerator Model Zoo
+The accelerator model zoo provides a set of efficient models for target devices, with pretrained checkpoints. To learn more about how to build models, load checkpoints, and deploy them, please refer to [Use PyTorchVideo/Accelerator Model Zoo](https://pytorchvideo.org/docs/tutorial_accelerator_use_accelerator_model_zoo). A minimal build-and-convert sketch follows the table below.
+
+**Efficient models for mobile CPU**
+All top-1/top-5 accuracies are measured with 10-clip evaluation. Latency is benchmarked on a Samsung S8 phone with a 1-second input clip.
+
+| model | model builder | top 1 | top 5 | latency (ms) | params (M) | checkpoint |
+|--------|--------------------------------------------------------------------------|-------|-------|--------------|----------------|---------------------|
+| X3D_XS | models.accelerator.mobile_cpu.efficient_x3d.EfficientX3d(expansion="XS") | 68.5 | 88.0 | 233 | 3.8 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/efficient_x3d_xs_original_form.pyth) |
+| X3D_S | models.accelerator.mobile_cpu.efficient_x3d.EfficientX3d(expansion="S") | 73.0 | 90.6 | 764 | 3.8 | [link](https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/efficient_x3d_s_original_form.pyth) |
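+
+As a hedged sketch of building and converting one of these models (the `EfficientX3d` path comes from the table above; the 1x3x4x160x160 input size for X3D_XS is an assumption):
+
+```python
+import torch
+from pytorchvideo.accelerator.deployment.mobile_cpu.utils.model_conversion import (
+    convert_to_deployable_form,
+)
+from pytorchvideo.models.accelerator.mobile_cpu.efficient_x3d import EfficientX3d
+
+# Build the mobile-CPU friendly X3D_XS model in its original (trainable) form.
+model = EfficientX3d(expansion="XS")
+
+# Convert to deployable form for a fixed input clip size before benchmarking/export.
+dummy_input = torch.randn(1, 3, 4, 160, 160)  # (B, C, T, H, W); size is an assumption
+deployable_model = convert_to_deployable_form(model, dummy_input)
+```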
diff --git a/docs/source/models.md b/docs/source/models.md
new file mode 100644
index 00000000..84af2fb2
--- /dev/null
+++ b/docs/source/models.md
@@ -0,0 +1,180 @@
+# PyTorchVideo/Model
+
+
+PyTorchVideo is an open source video understanding library that provides up-to-date builders for state-of-the-art video understanding backbones, layers, heads, and losses addressing different tasks, including acoustic event detection, action recognition (video classification), action detection (video detection), multimodal understanding (acoustic-visual classification), and self-supervised learning.
+
+The models subpackage contains definitions for the following model architectures and layers:
+
+
+* Acoustic Backbone
+ * Acoustic ResNet
+* Visual Backbone
+ * [I3D](https://arxiv.org/pdf/1705.07750.pdf)
+ * [C2D](https://arxiv.org/pdf/1711.07971.pdf)
+ * [Squeeze-and-Excitation Networks](https://arxiv.org/pdf/1709.01507.pdf)
+ * [Nonlocal Networks](https://arxiv.org/pdf/1711.07971.pdf)
+ * [R2+1D](https://openaccess.thecvf.com/content_cvpr_2018/papers/Tran_A_Closer_Look_CVPR_2018_paper.pdf)
+ * CSN
+ * [SlowFast](https://arxiv.org/pdf/1812.03982.pdf)
+ * [X3D](https://arxiv.org/pdf/2004.04730.pdf)
+* Self-Supervised Learning
+ * [SimCLR](https://arxiv.org/pdf/2002.05709.pdf)
+ * [Bootstrap Your Own Latent](https://arxiv.org/pdf/2006.07733.pdf)
+ * [Non-Parametric Instance Discrimination](https://openaccess.thecvf.com/content_cvpr_2018/CameraReady/0801.pdf)
+
+
+## Build standard models
+
+PyTorchVideo provides default builders to construct state-of-the-art video understanding models, layers, heads, and losses.
+
+### Models
+
+You can construct a model with random weights by calling its constructor:
+
+```
+import pytorchvideo.models as models
+
+resnet = models.create_resnet()
+acoustic_resnet = models.create_acoustic_resnet()
+slowfast = models.create_slowfast()
+x3d = models.create_x3d()
+r2plus1d = models.create_r2plus1d()
+csn = models.create_csn()
+```
+
+You can verify whether you have built the model successfully by:
+
+```
+import torch
+import pytorchvideo.models as models
+
+resnet = models.create_resnet()
+B, C, T, H, W = 2, 3, 8, 224, 224
+input_tensor = torch.zeros(B, C, T, H, W)
+output = resnet(input_tensor)
+```
+
+### Layers
+
+You can construct a layer with random weights by calling its constructor:
+
+```
+import pytorchvideo.layers as layers
+
+nonlocal_block = layers.create_nonlocal(dim_in=256, dim_inner=128)
+swish = layers.Swish()
+conv_2plus1d = layers.create_conv_2plus1d(in_channels=256, out_channels=512)
+```
+
+You can verify whether you have built the model successfully by:
+
+```
+import torch
+import pytorchvideo.layers as layers
+
+nonlocal_block = layers.create_nonlocal(dim_in=256, dim_inner=128)
+B, C, T, H, W = 2, 256, 4, 14, 14
+input_tensor = torch.zeros(B, C, T, H, W)
+output = nonlocal_block(input_tensor)
+
+swish = layers.Swish()
+B, C, T, H, W = 2, 256, 4, 14, 14
+input_tensor = torch.zeros(B, C, T, H, W)
+output = swish(input_tensor)
+
+conv_2plus1d = layers.create_conv_2plus1d(in_channels=256, out_channels=512)
+B, C, T, H, W = 2, 256, 4, 14, 14
+input_tensor = torch.zeros(B, C, T, H, W)
+output = conv_2plus1d(input_tensor)
+```
+
+### Heads
+
+You can construct a head with random weights by calling its constructor:
+
+```
+import pytorchvideo.models as models
+
+res_head = models.head.create_res_basic_head(in_features=256, out_features=400)  # example dims
+x3d_head = models.x3d.create_x3d_head(dim_in=1024, dim_inner=512, dim_out=2048, num_classes=400)
+```
+
+You can verify whether you have built the head successfully by:
+
+```
+import torch
+import pytorchvideo.models as models
+
+res_head = models.head.create_res_basic_head(in_features=256, out_features=400)
+B, C, T, H, W = 2, 256, 4, 14, 14
+input_tensor = torch.zeros(B, C, T, H, W)
+output = res_head(input_tensor)
+
+x3d_head = models.x3d.create_x3d_head(dim_in=1024, dim_inner=512, dim_out=2048, num_classes=400)
+B, C, T, H, W = 2, 256, 4, 14, 14
+input_tensor = torch.zeros(B, C, T, H, W)
+output = x3d_head(input_tensor)
+```
+
+### Losses
+
+You can construct a loss by calling its constructor:
+
+```
+import pytorchvideo.models as models
+
+simclr_loss = models.SimCLR()
+```
+
+You can verify whether you have built the loss successfully by:
+
+```
+import torch
+import pytorchvideo.models as models
+import pytorchvideo.layers as layers
+
+resnet = models.create_resnet()
+mlp = layers.make_multilayer_perceptron(fully_connected_dims=(2048, 1024, 2048))
+simclr_loss = models.SimCLR(mlp=mlp, backbone=resnet)
+B, C, T, H, W = 2, 256, 4, 14, 14
+view1, view2 = torch.zeros(B, C, T, H, W), torch.zeros(B, C, T, H, W)
+loss = simclr_loss(view1, view2)
+```
+
+## Build customized models
+
+PyTorchVideo also supports building models with customized components, which is an important feature for video understanding research. Here we take a standard stem module as an example and show how to build each ResNet component (head, backbone, stem) separately, as well as how to replace the standard components with your customized ones.
+
+
+```
+from pytorchvideo.models.stem import create_res_basic_stem
+
+
+# Create standard stem layer.
+stem = create_res_basic_stem(in_channels=3, out_channels=64)
+
+# Create customized stem layer with YourFancyNorm
+stem = create_res_basic_stem(
+ in_channels=3,
+ out_channels=64,
+ norm=YourFancyNorm, # GhostNorm for example
+)
+
+# Create customized stem layer with YourFancyConv
+stem = create_res_basic_stem(
+ in_channels=3,
+ out_channels=64,
+ conv=YourFancyConv, # OctConv for example
+)
+
+# Create customized stem layer with YourFancyAct
+stem = create_res_basic_stem(
+ in_channels=3,
+ out_channels=64,
+ activation=YourFancyAct, # Swish for example
+)
+
+# Create customized stem layer with YourFancyPool
+stem = create_res_basic_stem(
+ in_channels=3,
+ out_channels=64,
+ pool=YourFancyPool, # MinPool for example
+)
+
+```
diff --git a/docs/source/transforms.md b/docs/source/transforms.md
new file mode 100644
index 00000000..42afc1a4
--- /dev/null
+++ b/docs/source/transforms.md
@@ -0,0 +1,33 @@
+# Overview
+
+The PyTorchVideo transforms package contains common video algorithms used for preprocessing and/or augmenting video data. The package also contains helper dictionary transforms that are useful for interoperability between PyTorchVideo dataset clip outputs (TODO link to sample datasets clip) and domain-specific transforms. For example, here is a standard transform pipeline for a video model that could be used with a PyTorchVideo dataset:
+
+```python
+transform = torchvision.transforms.Compose([
+ pytorchvideo.transforms.ApplyTransformToKey(
+ key="video",
+ transform=torchvision.transforms.Compose([
+ pytorchvideo.transforms.UniformTemporalSubsample(8),
+ pytorchvideo.transforms.Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
+ pytorchvideo.transforms.RandomShortSideScale(min_size=256, max_size=320),
+ torchvision.transforms.RandomCrop(244),
+ torchvision.transforms.RandomHorizontalFlip(p=0.5),
+    ]),
+ )
+])
+dataset = pytorchvideo.data.Kinetics(
+ data_path="path/to/kinetics_root/train.csv",
+ clip_sampler=pytorchvideo.data.make_clip_sampler("random", duration=2),
+ transform=transform
+)
+```
+
+Notice how the example also includes transforms from TorchVision? PyTorchVideo uses the same canonical tensor shape as TorchVision for video and TorchAudio for audio. This allows the frameworks to be used together freely.
+
+## Transform vs Functional interface
+
+The example above demonstrates the pytorchvideo.transforms interface. These transforms are nn.Module callable classes that can be strung together in a declarative way. PyTorchVideo also provides a pytorchvideo.transforms.functional interface, which exposes the underlying functions that the nn.Module classes use. These allow more fine-grained control over the transformations and may be more suitable for use outside the dataset preprocessing use case, as sketched below.
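+
+A minimal sketch of the functional interface (assuming a `(C, T, H, W)` video tensor; the exact set of exported functions may differ):
+
+```python
+import torch
+from pytorchvideo.transforms.functional import (
+    short_side_scale,
+    uniform_temporal_subsample,
+)
+
+video = torch.randn(3, 32, 256, 320)  # (C, T, H, W)
+video = uniform_temporal_subsample(video, num_samples=8)
+video = short_side_scale(video, size=224)
+```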
+
+## Scriptable transforms
+
+All non-OpenCV transforms are TorchScriptable, as described in the [TorchVision docs](https://pytorch.org/vision/stable/transforms.html#scriptable-transforms). In order to script transforms together, please use torch.nn.Sequential instead of torchvision.transforms.Compose, as in the sketch below.
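+
+A hedged sketch (assuming the chosen transforms script cleanly, as described above):
+
+```python
+import torch
+from pytorchvideo.transforms import Normalize, UniformTemporalSubsample
+
+# nn.Sequential (unlike Compose) is itself an nn.Module, so it can be scripted.
+transform = torch.nn.Sequential(
+    UniformTemporalSubsample(8),
+    Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
+)
+scripted_transform = torch.jit.script(transform)
+```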
diff --git a/hubconf.py b/hubconf.py
new file mode 100644
index 00000000..38e86bd5
--- /dev/null
+++ b/hubconf.py
@@ -0,0 +1,13 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+dependencies = ["torch"]
+from pytorchvideo.models.hub import ( # noqa: F401, E402
+ slow_r50,
+ slowfast_r50,
+ slowfast_r101,
+ x3d_m,
+ x3d_s,
+ x3d_xs,
+ efficient_x3d_xs,
+ efficient_x3d_s,
+)
diff --git a/pytorchvideo/__init__.py b/pytorchvideo/__init__.py
new file mode 100644
index 00000000..90afbb95
--- /dev/null
+++ b/pytorchvideo/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+__version__ = "0.1.0"
diff --git a/pytorchvideo/accelerator/__init__.py b/pytorchvideo/accelerator/__init__.py
new file mode 100644
index 00000000..5c7f19c6
--- /dev/null
+++ b/pytorchvideo/accelerator/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/pytorchvideo/accelerator/deployment/__init__.py b/pytorchvideo/accelerator/deployment/__init__.py
new file mode 100644
index 00000000..5c7f19c6
--- /dev/null
+++ b/pytorchvideo/accelerator/deployment/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/pytorchvideo/accelerator/deployment/common/__init__.py b/pytorchvideo/accelerator/deployment/common/__init__.py
new file mode 100644
index 00000000..5c7f19c6
--- /dev/null
+++ b/pytorchvideo/accelerator/deployment/common/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/pytorchvideo/accelerator/deployment/common/model_transmuter.py b/pytorchvideo/accelerator/deployment/common/model_transmuter.py
new file mode 100644
index 00000000..e1593528
--- /dev/null
+++ b/pytorchvideo/accelerator/deployment/common/model_transmuter.py
@@ -0,0 +1,86 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import logging
+from typing import List
+
+import torch.nn as nn
+
+
+"""
+This file contains the top-level transmuter, which converts a user input model (nn.Module)
+into an equivalent model composed of efficientBlocks for the target device.
+Specifically, each target device has a transmuter list, which contains transmuter
+functions that convert a module into an equivalent efficientBlock. Each transmuter list is
+registered in EFFICIENT_BLOCK_TRANSMUTER_REGISTRY and accessed by the top-level transmuter.
+"""
+EFFICIENT_BLOCK_TRANSMUTER_REGISTRY = {}
+
+
+def _find_equivalent_efficient_module(
+ module_input: nn.Module,
+ efficient_block_transmuter_list: List,
+ module_name: str = "",
+):
+ """
+    Given module_input, search through efficient_block_transmuter_list to see whether
+    module_input can be replaced with an equivalent efficientBlock. Returns None if no
+    equivalent efficientBlock is found, else returns an instance of the equivalent
+    efficientBlock.
+ Args:
+ module_input (nn.Module): module to be replaced by equivalent efficientBlock
+ efficient_block_transmuter_list (list): a transmuter list that contains transmuter
+ functions for available efficientBlocks
+ module_name (str): name of module_input in original model
+ """
+ eq_module_hit_list = []
+ for iter_func in efficient_block_transmuter_list:
+ eq_module = iter_func(module_input)
+ if eq_module is not None:
+ eq_module_hit_list.append(eq_module)
+ if len(eq_module_hit_list) > 0:
+ # Check for multiple matches.
+ if len(eq_module_hit_list) > 1:
+ logging.warning(f"{module_name} has multiple matches:")
+ for iter_match in eq_module_hit_list:
+ logging.warning(f"{iter_match.__class__.__name__} is a match.")
+ logging.warning(
+ f"Will use {eq_module_hit_list[0]} as it has highest priority."
+ )
+ return eq_module_hit_list[0]
+ return None
+
+
+def transmute_model(
+ model: nn.Module,
+ target_device: str = "mobile_cpu",
+ prefix: str = "",
+):
+ """
+    Recursively goes through the user input model and replaces each module in place with an
+    available equivalent efficientBlock for the target device.
+ Args:
+ model (nn.Module): user input model to be transmuted
+ target_device (str): name of target device, used to access transmuter list in
+ EFFICIENT_BLOCK_TRANSMUTER_REGISTRY
+ prefix (str): name of current hierarchy in user model
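+
+    Example (a hedged sketch; importing the mobile_cpu transmuter package registers its
+    transmuter list, and transmutation happens in place on the user model):
+        >>> import torch.nn as nn
+        >>> import pytorchvideo.accelerator.deployment.mobile_cpu.transmuter  # noqa: F401
+        >>> user_model = nn.Sequential(nn.Conv3d(3, 8, kernel_size=1), nn.ReLU())
+        >>> transmute_model(user_model, target_device="mobile_cpu")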
+ """
+ assert (
+ target_device in EFFICIENT_BLOCK_TRANSMUTER_REGISTRY
+ ), f"{target_device} not registered in EFFICIENT_BLOCK_TRANSMUTER_REGISTRY!"
+ transmuter_list = EFFICIENT_BLOCK_TRANSMUTER_REGISTRY[target_device]
+ for name, child in model.named_children():
+ equivalent_module = _find_equivalent_efficient_module(
+ child, transmuter_list, module_name=f"{prefix}.{name}"
+ )
+ if equivalent_module is not None:
+ model._modules[name] = equivalent_module
+ logging.info(
+ f"Replacing {prefix}.{name} ({child.__class__.__name__}) with "
+ f"{equivalent_module.__class__.__name__}"
+ )
+ else:
+ transmute_model(
+ child,
+ target_device=target_device,
+ prefix=f"{prefix}.{name}",
+ )
diff --git a/pytorchvideo/accelerator/deployment/mobile_cpu/__init__.py b/pytorchvideo/accelerator/deployment/mobile_cpu/__init__.py
new file mode 100644
index 00000000..5c7f19c6
--- /dev/null
+++ b/pytorchvideo/accelerator/deployment/mobile_cpu/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/pytorchvideo/accelerator/deployment/mobile_cpu/transmuter/__init__.py b/pytorchvideo/accelerator/deployment/mobile_cpu/transmuter/__init__.py
new file mode 100644
index 00000000..8c573dc2
--- /dev/null
+++ b/pytorchvideo/accelerator/deployment/mobile_cpu/transmuter/__init__.py
@@ -0,0 +1,10 @@
+from pytorchvideo.accelerator.deployment.common.model_transmuter import (
+ EFFICIENT_BLOCK_TRANSMUTER_REGISTRY,
+)
+
+from .transmuter_mobile_cpu import EFFICIENT_BLOCK_TRANSMUTER_MOBILE_CPU
+
+
+EFFICIENT_BLOCK_TRANSMUTER_REGISTRY[
+ "mobile_cpu"
+] = EFFICIENT_BLOCK_TRANSMUTER_MOBILE_CPU
diff --git a/pytorchvideo/accelerator/deployment/mobile_cpu/transmuter/transmuter_mobile_cpu.py b/pytorchvideo/accelerator/deployment/mobile_cpu/transmuter/transmuter_mobile_cpu.py
new file mode 100644
index 00000000..dfaee8a8
--- /dev/null
+++ b/pytorchvideo/accelerator/deployment/mobile_cpu/transmuter/transmuter_mobile_cpu.py
@@ -0,0 +1,204 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import torch.nn as nn
+from pytorchvideo.layers.accelerator.mobile_cpu.convolutions import (
+ Conv3d3x1x1BnAct,
+ Conv3d3x3x3DwBnAct,
+ Conv3d5x1x1BnAct,
+ Conv3dPwBnAct,
+ Conv3dTemporalKernel1BnAct,
+)
+
+
+def transmute_Conv3dPwBnAct(input_module: nn.Module):
+ """
+    Given an input_module, transmutes it into an equivalent Conv3dPwBnAct. Returns None
+ if no equivalent Conv3dPwBnAct is found, else returns an instance of equivalent
+ Conv3dPwBnAct.
+ Args:
+ input_module (nn.Module): input module to find an equivalent Conv3dPwBnAct
+ """
+ if not isinstance(input_module, nn.Conv3d):
+ return None
+ if (
+ input_module.kernel_size == (1, 1, 1)
+ and input_module.groups == 1
+ and input_module.stride == (1, 1, 1)
+ and input_module.padding == (0, 0, 0)
+ and input_module.dilation == (1, 1, 1)
+ ):
+ module = Conv3dPwBnAct(
+ in_channels=input_module.in_channels,
+ out_channels=input_module.out_channels,
+ bias=False if input_module.bias is None else True,
+ activation="identity",
+ use_bn=False,
+ )
+ module.kernel.conv.load_state_dict(input_module.state_dict())
+ return module
+ else:
+ return None
+
+
+def transmute_Conv3d3x3x3DwBnAct(input_module: nn.Module):
+ """
+    Given an input_module, transmutes it into an equivalent Conv3d3x3x3DwBnAct. Returns
+ None if no equivalent Conv3d3x3x3DwBnAct is found, else returns an instance of
+ equivalent Conv3d3x3x3DwBnAct.
+ Args:
+ input_module (nn.Module): input module to find an equivalent Conv3d3x3x3DwBnAct
+ """
+ if not isinstance(input_module, nn.Conv3d):
+ return None
+ if (
+ input_module.kernel_size == (3, 3, 3)
+ and input_module.in_channels == input_module.out_channels
+ and input_module.groups == input_module.out_channels
+ and input_module.stride[0] == 1
+ and input_module.stride[1] == input_module.stride[2]
+ and input_module.padding == (1, 1, 1)
+ and input_module.padding_mode == "zeros"
+ and input_module.dilation == (1, 1, 1)
+ ):
+ spatial_stride = input_module.stride[1]
+ module = Conv3d3x3x3DwBnAct(
+ in_channels=input_module.in_channels,
+ spatial_stride=spatial_stride,
+ bias=False if input_module.bias is None else True,
+ activation="identity",
+ use_bn=False,
+ )
+ module.kernel.conv.load_state_dict(input_module.state_dict())
+ return module
+ else:
+ return None
+
+
+def transmute_Conv3dTemporalKernel1BnAct(input_module: nn.Module):
+ """
+    Given an input_module, transmutes it into an equivalent Conv3dTemporalKernel1BnAct.
+ Returns None if no equivalent Conv3dTemporalKernel1BnAct is found, else returns
+ an instance of equivalent Conv3dTemporalKernel1BnAct.
+ Args:
+ input_module (nn.Module): input module to find an equivalent Conv3dTemporalKernel1BnAct
+ """
+ if not isinstance(input_module, nn.Conv3d):
+ return None
+ """
+ If the input_module can be replaced by Conv3dPwBnAct, don't use
+ Conv3dTemporalKernel1BnAct.
+ """
+ if (
+ input_module.kernel_size == (1, 1, 1)
+ and input_module.groups == 1
+ and input_module.stride == (1, 1, 1)
+ and input_module.padding == (0, 0, 0)
+ and input_module.dilation == (1, 1, 1)
+ ):
+ return None
+
+ if (
+ input_module.kernel_size[0] == 1
+ and input_module.kernel_size[1] == input_module.kernel_size[2]
+ and input_module.stride[0] == 1
+ and input_module.stride[1] == input_module.stride[2]
+ and input_module.padding[0] == 0
+ and input_module.dilation[0] == 1
+ ):
+ spatial_stride = input_module.stride[1]
+ spatial_kernel = input_module.kernel_size[1]
+ spatial_padding = input_module.padding[1]
+ spatial_dilation = input_module.dilation[1]
+ module = Conv3dTemporalKernel1BnAct(
+ in_channels=input_module.in_channels,
+ out_channels=input_module.out_channels,
+ bias=False if input_module.bias is None else True,
+ groups=input_module.groups,
+ spatial_kernel=spatial_kernel,
+ spatial_stride=spatial_stride,
+ spatial_padding=spatial_padding,
+ spatial_dilation=spatial_dilation,
+ activation="identity",
+ use_bn=False,
+ )
+ module.kernel.conv.load_state_dict(input_module.state_dict())
+ return module
+ else:
+ return None
+
+
+def transmute_Conv3d3x1x1BnAct(input_module: nn.Module):
+ """
+    Given an input_module, transmutes it into an equivalent Conv3d3x1x1BnAct.
+ Returns None if no equivalent Conv3d3x1x1BnAct is found, else returns
+ an instance of equivalent Conv3d3x1x1BnAct.
+ Args:
+ input_module (nn.Module): input module to find an equivalent Conv3d3x1x1BnAct
+ """
+ if not isinstance(input_module, nn.Conv3d):
+ return None
+
+ if (
+ input_module.kernel_size == (3, 1, 1)
+ and input_module.stride == (1, 1, 1)
+ and input_module.padding == (1, 0, 0)
+ and input_module.dilation == (1, 1, 1)
+ and input_module.padding_mode == "zeros"
+ ):
+ module = Conv3d3x1x1BnAct(
+ in_channels=input_module.in_channels,
+ out_channels=input_module.out_channels,
+ bias=False if input_module.bias is None else True,
+ groups=input_module.groups,
+ activation="identity",
+ use_bn=False,
+ )
+ module.kernel.conv.load_state_dict(input_module.state_dict())
+ return module
+ else:
+ return None
+
+
+def transmute_Conv3d5x1x1BnAct(input_module: nn.Module):
+ """
+    Given an input_module, transmutes it into an equivalent Conv3d5x1x1BnAct.
+ Returns None if no equivalent Conv3d5x1x1BnAct is found, else returns
+ an instance of equivalent Conv3d5x1x1BnAct.
+ Args:
+ input_module (nn.Module): input module to find an equivalent Conv3d5x1x1BnAct
+ """
+ if not isinstance(input_module, nn.Conv3d):
+ return None
+
+ if (
+ input_module.kernel_size == (5, 1, 1)
+ and input_module.stride == (1, 1, 1)
+ and input_module.padding == (2, 0, 0)
+ and input_module.dilation == (1, 1, 1)
+ and input_module.padding_mode == "zeros"
+ ):
+ module = Conv3d5x1x1BnAct(
+ in_channels=input_module.in_channels,
+ out_channels=input_module.out_channels,
+ bias=False if input_module.bias is None else True,
+ groups=input_module.groups,
+ activation="identity",
+ use_bn=False,
+ )
+ module.kernel.conv.load_state_dict(input_module.state_dict())
+ return module
+ else:
+ return None
+
+
+"""
+List of efficient_block transmuters for mobile_cpu. If one module matches multiple
+transmuters, the first matched transmuter in the list will be used.
+"""
+EFFICIENT_BLOCK_TRANSMUTER_MOBILE_CPU = [
+ transmute_Conv3dPwBnAct,
+ transmute_Conv3d3x3x3DwBnAct,
+ transmute_Conv3dTemporalKernel1BnAct,
+ transmute_Conv3d3x1x1BnAct,
+ transmute_Conv3d5x1x1BnAct,
+]
diff --git a/pytorchvideo/accelerator/deployment/mobile_cpu/utils/__init__.py b/pytorchvideo/accelerator/deployment/mobile_cpu/utils/__init__.py
new file mode 100644
index 00000000..5c7f19c6
--- /dev/null
+++ b/pytorchvideo/accelerator/deployment/mobile_cpu/utils/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/pytorchvideo/accelerator/deployment/mobile_cpu/utils/model_conversion.py b/pytorchvideo/accelerator/deployment/mobile_cpu/utils/model_conversion.py
new file mode 100644
index 00000000..e53f04c4
--- /dev/null
+++ b/pytorchvideo/accelerator/deployment/mobile_cpu/utils/model_conversion.py
@@ -0,0 +1,102 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from copy import deepcopy
+from typing import Dict, List
+
+import torch
+import torch.nn as nn
+from pytorchvideo.accelerator.efficient_blocks.efficient_block_base import (
+ EfficientBlockBase,
+)
+
+
+def _add_input_tensor_size_lut_hook(
+ module: nn.Module,
+ input_tensor_size_lut: Dict,
+ hook_handle_list: List,
+ base_name: str = "",
+) -> None:
+ """
+    This helper function recursively goes through all modules in a network and registers
+    a forward hook on each module. The hook function records the input tensor
+    size seen during forward in input_tensor_size_lut[base_name].
+ Args:
+ module (nn.Module): input module to add hook recursively.
+ input_tensor_size_lut (dict): lut to record input tensor size for hook function.
+ hook_handle_list (list): a list to contain hook handles.
+ base_name (str): name for module input.
+ """
+
+ def hook_fn(_, _in, _out):
+ if isinstance(_in[0], torch.Tensor):
+ input_tensor_size_lut[base_name] = tuple(_in[0].size())
+ return
+
+ handle = module.register_forward_hook(hook_fn)
+ hook_handle_list.append(handle)
+ for name, child in module.named_children():
+ _add_input_tensor_size_lut_hook(
+ child,
+ input_tensor_size_lut,
+ hook_handle_list,
+ base_name=f"{base_name}.{name}",
+ )
+
+
+def _convert_module(
+ module: nn.Module,
+ input_tensor_size_lut: Dict,
+ base_name: str = "",
+) -> None:
+ """
+    This helper function recursively goes through the sub-modules in a network. If the current
+    module is an efficient block (an instance of EfficientBlockBase) with a convert() method,
+    its convert() method will be called, and the input tensor size (needed by efficient
+    blocks for mobile cpu) will be provided by matching the module name in
+    input_tensor_size_lut.
+    Otherwise, if the input module is not an efficient block, this function will go
+    through the child modules of the input module to look for any efficient block lower in
+    the hierarchy.
+ Args:
+ module (nn.Module): input module for convert.
+ input_tensor_size_lut (dict): input tensor size look-up table.
+ base_name (str): module name for input module.
+ """
+ if isinstance(module, EfficientBlockBase):
+ module.convert(input_tensor_size_lut[base_name])
+ else:
+ for name, child in module.named_children():
+ _convert_module(
+ child, input_tensor_size_lut, base_name=f"{base_name}.{name}"
+ )
+
+
+def convert_to_deployable_form(
+ model: nn.Module,
+ input_tensor: torch.Tensor,
+) -> nn.Module:
+ """
+ This function takes an input model, and returns a deployable model copy.
+ Args:
+        model (nn.Module): input model for conversion. The model can include a mix of
+            efficient blocks (instances of EfficientBlockBase) and non-efficient blocks.
+            The efficient blocks will be converted by calling their convert() method, while
+            other blocks will stay unchanged.
+        input_tensor (torch.Tensor): input tensor for the model. Note that the current
+            conversion to deployable form on mobile cpu only works for a single input tensor
+            size (i.e., any future input tensor to the converted model should have the same
+            size as the input_tensor specified here).
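+
+    Example (a hedged sketch; the EfficientX3d import path and the 1x3x4x160x160 input size
+    are assumptions):
+        >>> import torch
+        >>> from pytorchvideo.models.accelerator.mobile_cpu.efficient_x3d import EfficientX3d
+        >>> model = EfficientX3d(expansion="XS")
+        >>> input_tensor = torch.randn(1, 3, 4, 160, 160)
+        >>> deployable_model = convert_to_deployable_form(model, input_tensor)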
+ """
+ input_tensor_size_lut = {}
+ hook_handle_list = []
+ _add_input_tensor_size_lut_hook(model, input_tensor_size_lut, hook_handle_list)
+ # Run forward to fill in input tensor lut.
+ model.eval()
+ model(input_tensor)
+ # Remove forward hooks.
+ for handle in hook_handle_list:
+ handle.remove()
+ model_converted = deepcopy(model)
+ model_converted.eval()
+ _convert_module(model_converted, input_tensor_size_lut)
+ return model_converted
diff --git a/pytorchvideo/accelerator/efficient_blocks/__init__.py b/pytorchvideo/accelerator/efficient_blocks/__init__.py
new file mode 100644
index 00000000..5c7f19c6
--- /dev/null
+++ b/pytorchvideo/accelerator/efficient_blocks/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/pytorchvideo/accelerator/efficient_blocks/efficient_block_base.py b/pytorchvideo/accelerator/efficient_blocks/efficient_block_base.py
new file mode 100644
index 00000000..1040218d
--- /dev/null
+++ b/pytorchvideo/accelerator/efficient_blocks/efficient_block_base.py
@@ -0,0 +1,35 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from abc import abstractmethod
+
+import torch.nn as nn
+
+
+class EfficientBlockBase(nn.Module):
+ """
+ PyTorchVideo/accelerator provides a set of efficient blocks
+ that have optimal efficiency for each target hardware device.
+
+    Each efficient block has two forms:
+    - original form: this form is for training. When an efficient block is instantiated,
+        it is in its original form.
+    - deployable form: this form is for deployment. Once the network is ready for
+        deployment, it can be converted into deployable form for efficient execution
+        on the target hardware. A block is transformed into deployable form by calling its
+        convert() method. During conversion to deployable form,
+        various optimizations (operator fusion, kernel optimization, etc.) are applied.
+
+    EfficientBlockBase is the base class for efficient blocks.
+    All efficient blocks should inherit this base class
+    and implement the following methods:
+    - forward(): same as required by nn.Module
+    - convert(): called to convert the block into deployable form
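+
+    Example (a hedged sketch of a trivial subclass; the convert() signature used by the
+    mobile-cpu deployment utilities passes the observed input tensor size):
+        >>> import torch.nn as nn
+        >>> class MyPointwiseBlock(EfficientBlockBase):
+        ...     def __init__(self):
+        ...         super().__init__()
+        ...         self.kernel = nn.Conv3d(3, 8, kernel_size=1)
+        ...     def forward(self, x):
+        ...         return self.kernel(x)
+        ...     def convert(self, input_blob_size=None, **kwargs):
+        ...         pass  # replace/fuse self.kernel for the target device here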
+ """
+
+ @abstractmethod
+ def convert(self):
+ pass
+
+ @abstractmethod
+ def forward(self):
+ pass
diff --git a/pytorchvideo/accelerator/efficient_blocks/no_op_convert_block.py b/pytorchvideo/accelerator/efficient_blocks/no_op_convert_block.py
new file mode 100644
index 00000000..81ce0aa5
--- /dev/null
+++ b/pytorchvideo/accelerator/efficient_blocks/no_op_convert_block.py
@@ -0,0 +1,26 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import torch.nn as nn
+
+from .efficient_block_base import EfficientBlockBase
+
+
+class NoOpConvertBlock(EfficientBlockBase):
+ """
+    This class provides an EfficientBlockBase-compatible interface for modules that do not
+    need conversion.
+    Args:
+        model (nn.Module): NoOpConvertBlock takes model as input and generates a wrapper
+            instance of EfficientBlockBase with the same functionality as model; no change
+            is applied when convert() is called.
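+
+    Example (a hedged sketch):
+        >>> import torch.nn as nn
+        >>> relu_block = NoOpConvertBlock(nn.ReLU())
+        >>> relu_block.convert()  # no-op: the wrapped module is left unchanged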
+ """
+
+ def __init__(self, model: nn.Module):
+ super().__init__()
+ self.model = model
+
+ def convert(self, *args, **kwargs):
+ pass
+
+ def forward(self, x):
+ return self.model(x)
diff --git a/pytorchvideo/data/__init__.py b/pytorchvideo/data/__init__.py
new file mode 100644
index 00000000..56430feb
--- /dev/null
+++ b/pytorchvideo/data/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from .charades import Charades # noqa
+from .clip_sampling import make_clip_sampler
+from .domsev import DomsevDataset # noqa
+from .encoded_video_dataset import EncodedVideoDataset # noqa
+from .epic_kitchen_forecasting import EpicKitchenForecasting # noqa
+from .epic_kitchen_recognition import EpicKitchenRecognition # noqa
+from .hmdb51 import Hmdb51 # noqa
+from .kinetics import Kinetics # noqa
+from .ssv2 import SSv2
+from .ucf101 import Ucf101 # noqa
diff --git a/pytorchvideo/data/charades.py b/pytorchvideo/data/charades.py
new file mode 100644
index 00000000..c1e12c22
--- /dev/null
+++ b/pytorchvideo/data/charades.py
@@ -0,0 +1,225 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import csv
+import functools
+import itertools
+import os
+from collections import defaultdict
+from typing import Any, Callable, List, Optional, Tuple, Type
+
+import torch
+import torch.utils.data
+from iopath.common.file_io import g_pathmgr
+from pytorchvideo.data.clip_sampling import ClipSampler
+from pytorchvideo.data.frame_video import FrameVideo
+
+from .utils import MultiProcessSampler
+
+
+class Charades(torch.utils.data.IterableDataset):
+ """
+ Action recognition video dataset for Charades stored as image frames.
+
+
+    This dataset handles the parsing of frames, loading and clip sampling for the
+    videos. All IO reading is done with PathManager, enabling non-local storage
+    URIs to be used.
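+
+    Example (a hedged sketch; the csv path is a placeholder):
+        >>> from pytorchvideo.data import make_clip_sampler
+        >>> dataset = Charades(
+        ...     data_path="path/to/charades_frame_list.csv",
+        ...     clip_sampler=make_clip_sampler("random", 2.0),
+        ... )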
+ """
+
+ # Number of classes represented by this dataset's annotated labels.
+ NUM_CLASSES = 157
+
+ def __init__(
+ self,
+ data_path: str,
+ clip_sampler: ClipSampler,
+ video_sampler: Type[torch.utils.data.Sampler] = torch.utils.data.RandomSampler,
+ transform: Optional[Callable[[dict], Any]] = None,
+ video_path_prefix: str = "",
+ frames_per_clip: Optional[int] = None,
+ ) -> None:
+ """
+ Args:
+ data_path (str): Path to the data file. This file must be a space
+ separated csv with the format:
+ `original_vido_id video_id frame_id path labels`
+
+ clip_sampler (ClipSampler): Defines how clips should be sampled from each
+ video. See the clip sampling documentation for more information.
+
+ video_sampler (Type[torch.utils.data.Sampler]): Sampler for the internal
+ video container. This defines the order videos are decoded and,
+ if necessary, the distributed split.
+
+ transform (Optional[Callable]): This callable is evaluated on the clip output before
+ the clip is returned. It can be used for user defined preprocessing and
+ augmentations to the clips. The clip output is a dictionary with the
+ following format:
+ {
+                    'video': <video_tensor>,
+                    'label': <clip_level_label>,
+                    'video_label': <video_level_label>,
+                    'video_index': <video_index>,
+                    'clip_index': <clip_index>,
+                    'aug_index': <aug_index>, the augmentation index, as augmentations
+                        might generate multiple views for one clip.
+ }
+ If transform is None, the raw clip output in the above format is
+ returned unmodified.
+ video_path_prefix (str): prefix path to add to all paths from data_path.
+ frames_per_clip (Optional[int]): The number of frames per clip to sample.
+ """
+ self._transform = transform
+ self._clip_sampler = clip_sampler
+ (
+ self._path_to_videos,
+ self._labels,
+ self._video_labels,
+ ) = _read_video_paths_and_labels(data_path, prefix=video_path_prefix)
+ self._video_sampler = video_sampler(self._path_to_videos)
+ self._video_sampler_iter = None # Initialized on first call to self.__next__()
+ self._frame_filter = (
+ functools.partial(
+ Charades._sample_clip_frames,
+ frames_per_clip=frames_per_clip,
+ )
+ if frames_per_clip is not None
+ else None
+ )
+
+ # Depending on the clip sampler type, we may want to sample multiple clips
+        # from one video. In that case, we store the video, label and previously sampled
+        # clip time in these variables.
+ self._loaded_video = None
+ self._loaded_clip = None
+ self._next_clip_start_time = 0.0
+
+ @staticmethod
+ def _sample_clip_frames(
+ frame_indices: List[int], frames_per_clip: int
+ ) -> List[int]:
+ """
+ Args:
+ frame_indices (list): list of frame indices.
+            frames_per_clip (int): The number of frames per clip to sample.
+
+ Returns:
+ (list): Outputs a subsampled list with num_samples frames.
+ """
+ num_frames = len(frame_indices)
+ indices = torch.linspace(0, num_frames - 1, frames_per_clip)
+ indices = torch.clamp(indices, 0, num_frames - 1).long()
+
+ return [frame_indices[idx] for idx in indices]
+
+ @property
+ def video_sampler(self):
+ return self._video_sampler
+
+ def __next__(self) -> dict:
+ """
+ Retrieves the next clip based on the clip sampling strategy and video sampler.
+
+ Returns:
+ A video clip with the following format if transform is None:
+ {
+                'video': <video_tensor>,
+                'label': <clip_level_label>,
+                'video_label': <video_level_label>,
+                'video_index': <video_index>,
+                'clip_index': <clip_index>,
+                'aug_index': <aug_index>, the augmentation index, as augmentations
+                    might generate multiple views for one clip.
+ }
+ Otherwise, the transform defines the clip output.
+ """
+ if not self._video_sampler_iter:
+ # Setup MultiProcessSampler here - after PyTorch DataLoader workers are spawned.
+ self._video_sampler_iter = iter(MultiProcessSampler(self._video_sampler))
+
+ if self._loaded_video:
+ video, video_index = self._loaded_video
+ else:
+ video_index = next(self._video_sampler_iter)
+ path_to_video_frames = self._path_to_videos[video_index]
+ video = FrameVideo.from_frame_paths(path_to_video_frames)
+ self._loaded_video = (video, video_index)
+
+ clip_start, clip_end, clip_index, aug_index, is_last_clip = self._clip_sampler(
+ self._next_clip_start_time, video.duration
+ )
+ # Only load the clip once and reuse previously stored clip if there are multiple
+ # views for augmentations to perform on the same clip.
+ if aug_index == 0:
+ self._loaded_clip = video.get_clip(clip_start, clip_end, self._frame_filter)
+ frames, frame_indices = (
+ self._loaded_clip["video"],
+ self._loaded_clip["frame_indices"],
+ )
+ self._next_clip_start_time = clip_end
+
+ if is_last_clip:
+ self._loaded_video = None
+ self._next_clip_start_time = 0.0
+
+ # Merge unique labels from each frame into clip label.
+ labels_by_frame = [
+ self._labels[video_index][i]
+ for i in range(min(frame_indices), max(frame_indices) + 1)
+ ]
+ sample_dict = {
+ "video": frames,
+ "label": labels_by_frame,
+ "video_label": self._video_labels[video_index],
+ "video_name": str(video_index),
+ "video_index": video_index,
+ "clip_index": clip_index,
+ "aug_index": aug_index,
+ }
+ if self._transform is not None:
+ sample_dict = self._transform(sample_dict)
+
+ return sample_dict
+
+ def __iter__(self):
+ return self
+
+
+def _read_video_paths_and_labels(
+    video_path_label_file: str, prefix: str = ""
+) -> Tuple[List[List[str]], List[List[List[int]]], List[List[int]]]:
+ """
+ Args:
+        video_path_label_file (str): a file that contains frame paths for each
+ video and the corresponding frame label. The file must be a space separated
+ csv of the format:
+ `original_vido_id video_id frame_id path labels`
+
+ prefix (str): prefix path to add to all paths from video_path_label_file.
+
+ """
+ image_paths = defaultdict(list)
+ labels = defaultdict(list)
+ with g_pathmgr.open(video_path_label_file, "r") as f:
+
+ # Space separated CSV with format: original_vido_id video_id frame_id path labels
+ csv_reader = csv.DictReader(f, delimiter=" ")
+ for row in csv_reader:
+ assert len(row) == 5
+ video_name = row["original_vido_id"]
+ path = os.path.join(prefix, row["path"])
+ image_paths[video_name].append(path)
+ frame_labels = row["labels"].replace('"', "")
+ label_list = []
+ if frame_labels:
+ label_list = [int(x) for x in frame_labels.split(",")]
+
+ labels[video_name].append(label_list)
+
+ # Extract image paths from dictionary and return paths and labels as list.
+ video_names = image_paths.keys()
+ image_paths = [image_paths[key] for key in video_names]
+ labels = [labels[key] for key in video_names]
+ # Aggregate labels from all frames to form video-level labels.
+ video_labels = [list(set(itertools.chain(*label_list))) for label_list in labels]
+ return image_paths, labels, video_labels
diff --git a/pytorchvideo/data/clip_sampling.py b/pytorchvideo/data/clip_sampling.py
new file mode 100644
index 00000000..90a9a7cb
--- /dev/null
+++ b/pytorchvideo/data/clip_sampling.py
@@ -0,0 +1,171 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import random
+from abc import ABC, abstractmethod
+from typing import NamedTuple
+
+
+class ClipInfo(NamedTuple):
+ """
+ Named-tuple for clip information with:
+ clip_start_sec (float): clip start time.
+ clip_end_sec (float): clip end time.
+ clip_index (int): clip index in the video.
+ aug_index (int): augmentation index for the clip. Different augmentation methods
+ might generate multiple views for the same clip.
+ is_last_clip (bool): a bool specifying whether there are more clips to be
+ sampled from the video.
+ """
+
+ clip_start_sec: float
+ clip_end_sec: float
+ clip_index: int
+ aug_index: int
+ is_last_clip: bool
+
+
+class ClipSampler(ABC):
+ """
+    Interface for clip samplers, which take the previously sampled clip time and the
+    video duration, and return a named-tuple `ClipInfo`.
+ """
+
+ def __init__(self, clip_duration: float) -> None:
+ self._clip_duration = clip_duration
+ self._current_clip_index = 0
+ self._current_aug_index = 0
+
+ @abstractmethod
+ def __call__(self, last_clip_time: float, video_duration: float) -> ClipInfo:
+ pass
+
+
+def make_clip_sampler(sampling_type: str, *args) -> ClipSampler:
+ """
+ Constructs the clip samplers found in this module from the given arguments.
+ Args:
+        sampling_type (str): choose the clip sampler to return. It has three options:
+            - uniform: constructs and returns UniformClipSampler
+            - random: constructs and returns RandomClipSampler
+            - constant_clips_per_video: constructs and returns ConstantClipsPerVideoSampler
+        *args: the args to pass to the chosen clip sampler constructor
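+
+    Example (a hedged sketch):
+        >>> clip_sampler = make_clip_sampler("random", 2.0)  # 2-second random clips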
+ """
+ if sampling_type == "uniform":
+ return UniformClipSampler(*args)
+ elif sampling_type == "random":
+ return RandomClipSampler(*args)
+ elif sampling_type == "constant_clips_per_video":
+ return ConstantClipsPerVideoSampler(*args)
+ else:
+ raise NotImplementedError(f"{sampling_type} not supported")
+
+
+class UniformClipSampler(ClipSampler):
+ """
+ Evenly splits the video into clips of size clip_duration.
+ """
+
+ def __init__(self, clip_duration: float) -> None:
+ super().__init__(clip_duration)
+
+ def __call__(self, last_clip_time: float, video_duration: float) -> ClipInfo:
+ """
+ Args:
+            last_clip_time (float): the last clip end time sampled from this video. This
+                should be 0.0 if the video hasn't had clips sampled yet.
+            video_duration (float): the duration of the video that's being sampled, in seconds.
+ Returns:
+ a named-tuple `ClipInfo`: includes the clip information of (clip_start_time,
+ clip_end_time, clip_index, aug_index, is_last_clip), where the times are in
+            seconds and is_last_clip is False when there is still more time in the video
+ to be sampled.
+
+ """
+ clip_start_sec = last_clip_time
+ clip_end_sec = clip_start_sec + self._clip_duration
+ clip_index = self._current_clip_index
+ self._current_clip_index += 1
+ is_last_clip = (clip_end_sec + self._clip_duration) > video_duration
+ return ClipInfo(clip_start_sec, clip_end_sec, clip_index, 0, is_last_clip)
+
+
+class RandomClipSampler(ClipSampler):
+ """
+ Randomly samples clip of size clip_duration from the videos.
+ """
+
+ def __init__(self, clip_duration: float) -> None:
+ super().__init__(clip_duration)
+
+ def __call__(self, last_clip_time: float, video_duration: float) -> ClipInfo:
+ """
+ Args:
+ last_clip_time (float): Not used for RandomClipSampler.
+ video_duration: (float): the duration (in seconds) for the video that's
+ being sampled
+ Returns:
+ a named-tuple `ClipInfo`: includes the clip information of (clip_start_time,
+ clip_end_time, clip_index, aug_index, is_last_clip). The times are in seconds.
+            clip_index, aug_index and is_last_clip are always 0, 0 and True, respectively.
+
+ """
+ max_possible_clip_start = max(video_duration - self._clip_duration, 0)
+ clip_start_sec = random.uniform(0, max_possible_clip_start)
+ return ClipInfo(
+ clip_start_sec, clip_start_sec + self._clip_duration, 0, 0, True
+ )
+
+
+class ConstantClipsPerVideoSampler(ClipSampler):
+ """
+ Evenly splits the video into clips_per_video increments and samples clips of size
+ clip_duration at these increments.
+ """
+
+ def __init__(
+ self, clip_duration: float, clips_per_video: int, augs_per_clip: int = 1
+ ) -> None:
+ super().__init__(clip_duration)
+ self._clips_per_video = clips_per_video
+ self._augs_per_clip = augs_per_clip
+
+ def __call__(self, last_clip_time: float, video_duration: float) -> ClipInfo:
+ """
+ Args:
+ last_clip_time (float): Not used for ConstantClipsPerVideoSampler.
+ video_duration: (float): the duration (in seconds) for the video that's
+ being sampled.
+ Returns:
+ a named-tuple `ClipInfo`: includes the clip information of (clip_start_time,
+ clip_end_time, clip_index, aug_index, is_last_clip). The times are in seconds.
+ is_last_clip is True after clips_per_video clips have been sampled or the end
+ of the video is reached.
+
+ """
+ max_possible_clip_start = max(video_duration - self._clip_duration, 0)
+ uniform_clip = max_possible_clip_start / self._clips_per_video
+ clip_start_sec = uniform_clip * self._current_clip_index
+ clip_index = self._current_clip_index
+ aug_index = self._current_aug_index
+
+ self._current_aug_index += 1
+ if self._current_aug_index >= self._augs_per_clip:
+ self._current_clip_index += 1
+ self._current_aug_index = 0
+
+ # Last clip is True if sampled self._clips_per_video or if end of video is reached.
+ is_last_clip = False
+ if (
+ self._current_clip_index >= self._clips_per_video
+ or uniform_clip * self._current_clip_index > max_possible_clip_start
+ ):
+ self._current_clip_index = 0
+ is_last_clip = True
+
+ return ClipInfo(
+ clip_start_sec,
+ clip_start_sec + self._clip_duration,
+ clip_index,
+ aug_index,
+ is_last_clip,
+ )
diff --git a/pytorchvideo/data/dataset_manifest_utils.py b/pytorchvideo/data/dataset_manifest_utils.py
new file mode 100644
index 00000000..623f7af6
--- /dev/null
+++ b/pytorchvideo/data/dataset_manifest_utils.py
@@ -0,0 +1,266 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import datetime
+import os
+from dataclasses import dataclass
+from enum import Enum
+from typing import Dict, Optional, Union
+
+from pytorchvideo.data.encoded_video import EncodedVideo
+from pytorchvideo.data.frame_video import FrameVideo
+from pytorchvideo.data.utils import (
+ DataclassFieldCaster,
+ load_dataclass_dict_from_csv,
+ save_dataclass_objs_to_headered_csv,
+)
+from pytorchvideo.data.video import Video
+
+
+@dataclass
+class EncodedVideoInfo(DataclassFieldCaster):
+ """
+ Class representing the location of an available encoded video.
+ """
+
+ video_id: str
+ file_path: str
+
+
+@dataclass
+class VideoFrameInfo(DataclassFieldCaster):
+ """
+ Class representing the locations of all frames that compose a video.
+ """
+
+ video_id: str
+ location: str
+ frame_file_stem: str
+ frame_string_length: int
+ min_frame_number: int
+ max_frame_number: int
+ file_extension: str
+
+
+@dataclass
+class VideoInfo(DataclassFieldCaster):
+ """
+ Class representing the video-level metadata of a video from an arbitrary video dataset.
+ """
+
+ video_id: str
+ resolution: str
+ duration: float
+ fps: float
+
+
+@dataclass
+class VideoClipInfo(DataclassFieldCaster):
+ video_id: str
+ start_time: float
+ stop_time: float
+
+
+class VideoDatasetType(Enum):
+ Frame = 1
+ EncodedVideo = 2
+
+
+class VideoDataset:
+ @staticmethod
+ def _load_videos(
+ video_data_manifest_file_path: Optional[str],
+ video_info_file_path: str,
+ multithreaded_io: bool,
+ dataset_type: VideoDatasetType,
+ ) -> Dict[str, Video]:
+ video_infos: Dict[str, VideoInfo] = load_dataclass_dict_from_csv(
+ video_info_file_path, VideoInfo, "video_id"
+ )
+ if dataset_type == VideoDatasetType.Frame:
+ return VideoDataset._load_frame_videos(
+ video_data_manifest_file_path, video_infos, multithreaded_io
+ )
+ elif dataset_type == VideoDatasetType.EncodedVideo:
+ return VideoDataset._load_encoded_videos(
+ video_data_manifest_file_path, video_infos
+ )
+
+ @staticmethod
+ def _load_frame_videos(
+ frame_manifest_file_path: str,
+ video_infos: Dict[str, VideoInfo],
+ multithreaded_io: bool,
+ ):
+ video_frames: Dict[str, VideoFrameInfo] = load_dataclass_dict_from_csv(
+ frame_manifest_file_path, VideoFrameInfo, "video_id"
+ )
+ VideoDataset._remove_video_info_missing_or_incomplete_videos(
+ video_frames, video_infos
+ )
+ return {
+ video_id: FrameVideo(
+ video_frame_paths=VideoDataset._frame_number_to_filepaths(
+ video_id, video_frames, video_infos
+ ),
+ duration=video_infos[video_id].duration,
+ fps=video_infos[video_id].fps,
+ multithreaded_io=multithreaded_io,
+ )
+ for video_id in video_infos
+ }
+
+ @staticmethod
+ def _load_encoded_videos(
+ encoded_video_manifest_file_path: str,
+ video_infos: Dict[str, VideoInfo],
+ ):
+ encoded_video_infos: Dict[str, EncodedVideoInfo] = load_dataclass_dict_from_csv(
+ encoded_video_manifest_file_path, EncodedVideoInfo, "video_id"
+ )
+ VideoDataset._remove_video_info_missing_or_incomplete_videos(
+ encoded_video_infos, video_infos
+ )
+
+ return {
+ video_id: EncodedVideo.from_path(encoded_video_info.file_path)
+ for video_id, encoded_video_info in encoded_video_infos.items()
+ }
+
+ @staticmethod
+ def _frame_number_to_filepaths(
+ video_id: str,
+ video_frames: Dict[str, VideoFrameInfo],
+ video_infos: Dict[str, VideoInfo],
+    ) -> Optional[List[str]]:
+ video_info = video_infos[video_id]
+ video_frame_info = video_frames[video_info.video_id]
+
+ frame_filepaths = []
+ num_frames = (
+ video_frame_info.max_frame_number - video_frame_info.min_frame_number + 1
+ )
+ for frame_index in range(num_frames):
+ frame_number = frame_index + video_frame_info.min_frame_number
+ if (
+ frame_number < video_frame_info.min_frame_number
+ or frame_number > video_frame_info.max_frame_number
+ ):
+ return None
+
+ frame_path_index = str(frame_number)
+ frame_prefix = video_frame_info.frame_file_stem
+ num_zero_pad = (
+ video_frame_info.frame_string_length
+ - len(frame_path_index)
+ - len(frame_prefix)
+ )
+ zero_padding = "0" * num_zero_pad
+ frame_component = (
+ f"{frame_prefix}{zero_padding}{frame_path_index}"
+ f".{video_frame_info.file_extension}"
+ )
+ frame_filepaths.append(f"{video_frame_info.location}/{frame_component}")
+ return frame_filepaths
+
+ @staticmethod
+ def _remove_video_info_missing_or_incomplete_videos(
+ video_data_infos: Dict[str, Union[VideoFrameInfo, EncodedVideoInfo]],
+ video_infos: Dict[str, VideoInfo],
+ ) -> None:
+        # Avoid deleting keys from the dict during iteration over its keys
+ video_ids = list(video_infos)
+ for video_id in video_ids:
+ video_info = video_infos[video_id]
+
+ # Remove videos we have metadata for but don't have video data
+ if video_id not in video_data_infos:
+ del video_infos[video_id]
+ continue
+
+ # Remove videos we have metadata for but don't have the right number of frames
+ if type(video_data_infos[video_id]) == VideoFrameInfo:
+ video_frames_info = video_data_infos[video_id]
+ expected_frames = round(video_info.duration * video_info.fps)
+ num_frames = (
+ video_frames_info.max_frame_number
+ - video_frames_info.min_frame_number
+ )
+ if abs(num_frames - expected_frames) > video_info.fps:
+ del video_data_infos[video_id]
+ del video_infos[video_id]
+
+ video_ids = list(video_data_infos) # Avoid modifying dict during iteration
+ for video_id in video_ids:
+ # Remove videos we have video data for but don't have metadata
+            if video_id not in video_infos:
+                del video_data_infos[video_id]
+
+
+def get_seconds_from_hms_time(time_str: str) -> float:
+ """
+    Get seconds from a timestamp of the form 'HH:MM:SS' or 'HH:MM:SS.ms'.
+
+ Args:
+ time_str (str)
+
+ Returns:
+ float of seconds
+
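+    Example (a hedged sketch):
+        >>> get_seconds_from_hms_time("00:01:30.5")
+        90.5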
+ """
+ for fmt in ("%H:%M:%S.%f", "%H:%M:%S"):
+ try:
+ time_since_min_time = datetime.datetime.strptime(time_str, fmt)
+ min_time = datetime.datetime.strptime("", "")
+ return float((time_since_min_time - min_time).total_seconds())
+ except ValueError:
+ pass
+ raise ValueError(f"No valid data format found for provided string {time_str}.")
+
+
+def save_encoded_video_manifest(
+ encoded_video_infos: Dict[str, EncodedVideoInfo], file_name: str = None
+) -> str:
+ """
+ Saves the encoded video dictionary as a csv file that can be read for future usage.
+
+ Args:
+        encoded_video_infos (Dict[str, EncodedVideoInfo]):
+ Dictionary mapping video_ids to metadata about the location of
+ their video data.
+
+ file_name (str):
+ location to save file (will be automatically generated if None).
+
+ Returns:
+ string of the filename where the video info is stored.
+ """
+ file_name = (
+ f"{os.getcwd()}/encoded_video_manifest.csv" if file_name is None else file_name
+ )
+ save_dataclass_objs_to_headered_csv(list(encoded_video_infos.values()), file_name)
+ return file_name
+
+
+def save_video_frame_info(
+ video_frames: Dict[str, VideoFrameInfo], file_name: str = None
+) -> str:
+ """
+ Saves the video frame dictionary as a csv file that can be read for future usage.
+
+ Args:
+ video_frames (Dict[str, VideoFrameInfo]):
+ Dictionary mapping video_ids to metadata about the location of
+ their video frame files.
+
+ file_name (str):
+ location to save file (will be automatically generated if None).
+
+ Returns:
+ string of the filename where the video info is stored.
+ """
+ file_name = (
+ f"{os.getcwd()}/video_frame_metadata.csv" if file_name is None else file_name
+ )
+ save_dataclass_objs_to_headered_csv(list(video_frames.values()), file_name)
+ return file_name
diff --git a/pytorchvideo/data/decoder.py b/pytorchvideo/data/decoder.py
new file mode 100644
index 00000000..b4059c45
--- /dev/null
+++ b/pytorchvideo/data/decoder.py
@@ -0,0 +1,7 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+from enum import Enum
+
+
+class DecoderType(Enum):
+ PYAV = "pyav"
+ TORCHVISION = "torchvision"
diff --git a/pytorchvideo/data/domsev.py b/pytorchvideo/data/domsev.py
new file mode 100644
index 00000000..9bfd549c
--- /dev/null
+++ b/pytorchvideo/data/domsev.py
@@ -0,0 +1,321 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import math
+from dataclasses import dataclass, fields as dataclass_fields
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import torch
+from pytorchvideo.data.dataset_manifest_utils import (
+ EncodedVideoInfo,
+ VideoClipInfo,
+ VideoDataset,
+ VideoDatasetType,
+ VideoInfo,
+)
+from pytorchvideo.data.utils import DataclassFieldCaster, load_dataclass_dict_from_csv
+from pytorchvideo.data.video import Video
+
+
+USER_SCENE_MAP = {
+ 0: "none",
+ 1: "indoor",
+ 2: "nature",
+ 3: "crowded_environment",
+ 4: "urban",
+}
+
+USER_ACTIVITY_MAP = {
+ 0: "none",
+ 1: "walking",
+ 2: "running",
+ 3: "standing",
+ 4: "biking",
+ 5: "driving",
+ 6: "playing",
+ 7: "cooking",
+ 8: "eating",
+ 9: "observing",
+ 10: "in_conversation",
+ 11: "browsing",
+ 12: "shopping",
+}
+
+USER_ATTENTION_MAP = {
+ 0: "none",
+ 1: "paying_attention",
+ 2: "interacting",
+}
+
+
+@dataclass
+class ActivityData(DataclassFieldCaster):
+ """
+ Class representing a contiguous activity video segment from the DoMSEV dataset.
+ """
+
+ video_id: str
+ start_time: float # Start time of the activity, in seconds
+ stop_time: float # Stop time of the activity, in seconds
+ start_frame: int # 0-indexed ID of the start frame (inclusive)
+ stop_frame: int # 0-index ID of the stop frame (inclusive)
+ activity_id: int
+ activity_name: str
+
+
+# Utility functions
+def seconds_to_frame_index(
+ time_in_seconds: float, fps: int, zero_indexed: Optional[bool] = True
+) -> int:
+ """Converts a point in time (in seconds) within a video clip to its closest
+    frame index (rounding down), based on a specified frame rate.
+
+ Args:
+ time_in_seconds (float): The point in time within the video.
+ fps (int): The frame rate (frames per second) of the video.
+ zero_indexed (Optional[bool]): Whether the returned frame should be
+ zero-indexed (if True) or one-indexed (if False).
+
+ Returns:
+ (int) The index of the nearest frame (rounding down to the nearest integer).
+ """
+ frame_idx = math.floor(time_in_seconds * fps)
+ if not zero_indexed:
+ frame_idx += 1
+ return frame_idx
+
+
+def frame_index_to_seconds(
+ frame_index: int, fps: int, zero_indexed: Optional[bool] = True
+) -> float:
+ """Converts a frame index within a video clip to the corresponding
+ point in time (in seconds) within the video, based on a specified frame rate.
+
+ Args:
+ frame_index (int): The index of the frame within the video.
+ fps (int): The frame rate (frames per second) of the video.
+ zero_indexed (Optional[bool]): Whether the specified frame is zero-indexed
+ (if True) or one-indexed (if False).
+
+ Returns:
+ (float) The point in time within the video.
+ """
+ if not zero_indexed:
+ frame_index -= 1
+ time_in_seconds = frame_index / fps
+ return time_in_seconds
+
+
+def get_overlap_for_time_range_pair(
+ t1_start: float, t1_stop: float, t2_start: float, t2_stop: float
+) -> Optional[Tuple[float, float]]:
+ """Calculates the overlap between two time ranges, if one exists.
+
+ Returns:
+        (Optional[Tuple]) A tuple of (overlap_start_time, overlap_stop_time) if
+        an overlap is found, or None otherwise.
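+
+    Example (a hedged sketch):
+        >>> get_overlap_for_time_range_pair(0.0, 5.0, 3.0, 8.0)
+        (3.0, 5.0)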
+ """
+ # Check if there is an overlap
+ if (t1_start <= t2_stop) and (t2_start <= t1_stop):
+ # Calculate the overlap period
+ overlap_start_time = max(t1_start, t2_start)
+ overlap_stop_time = min(t1_stop, t2_stop)
+ return (overlap_start_time, overlap_stop_time)
+ else:
+ return None
+
+
+class DomsevDataset(torch.utils.data.Dataset):
+ """
+ Egocentric activity classification video dataset for DoMSEV stored as
+ an encoded video (with frame-level labels).
+
+
+ This dataset handles the loading, decoding, and configurable clip
+ sampling for the videos.
+ """
+
+ def __init__(
+ self,
+ video_data_manifest_file_path: str,
+ video_info_file_path: str,
+ activities_file_path: str,
+ clip_sampler: Callable[
+ [Dict[str, Video], Dict[str, List[ActivityData]]], List[VideoClipInfo]
+ ],
+ dataset_type: VideoDatasetType = VideoDatasetType.Frame,
+ frames_per_second: int = 1,
+ transform: Optional[Callable[[Dict[str, Any]], Any]] = None,
+ frame_filter: Optional[Callable[[List[int]], List[int]]] = None,
+ multithreaded_io: bool = False,
+ ) -> None:
+ f"""
+ Args:
+ video_data_manifest_file_path (str):
+ The path to a manifest file outlining the available video data for the
+ associated videos. File must be a csv (w/header) with columns:
+ {[f.name for f in dataclass_fields(EncodedVideoInfo)]}
+
+ To generate this file from a directory of video frames, see helper
+ functions in Module: pytorchvideo.data.domsev.utils
+
+ video_info_file_path (str):
+ Path or URI to manifest with basic metadata of each video.
+ File must be a csv (w/header) with columns:
+ {[f.name for f in dataclass_fields(VideoInfo)]}
+
+ activities_file_path (str):
+ Path or URI to manifest with activity annotations for each video.
+ File must be a csv (w/header) with columns:
+ {[f.name for f in dataclass_fields(ActivityData)]}
+
+ clip_sampler (Callable[[Dict[str, Video], Dict[str, List[ActivityData]]],
+ List[VideoClipInfo]]):
+ This callable takes as input all available videos and their activity
+ annotations, and outputs a list of clips to be loaded by the dataset.
+
+ dataset_type (VideoDatasetType): The data format in which the dataset's
+ video data is stored (e.g. video frames, encoded video, etc.).
+
+ frames_per_second (int): The FPS of the stored videos. (NOTE:
+ this is variable and may be different than the original FPS
+ reported on the DoMSEV dataset website -- it depends on the
+ subsampling and frame extraction done internally at Facebook).
+
+ transform (Optional[Callable[[Dict[str, Any]], Any]]):
+ This callable is evaluated on the clip output before the clip is returned.
+ It can be used for user-defined preprocessing and augmentations to the clips.
+
+ The clip input is a dictionary with the following format:
+ {{
+ 'video': <video_tensor>,
+ 'audio': <audio_tensor>,
+ 'activities': <activities_tensor>,
+ 'start_time': <float>,
+ 'stop_time': <float>
+ }}
+
+ If transform is None, the raw clip output in the above format is
+ returned unmodified.
+
+ frame_filter (Optional[Callable[[List[int]], List[int]]]):
+ This callable is evaluated on the set of available frame indices to be
+ included in a sampled clip. This can be used to subselect frames within
+ a clip to be loaded.
+
+ multithreaded_io (bool):
+ Boolean to control whether parallelizable IO operations are performed across
+ multiple threads.
+ """
+ assert video_info_file_path
+ assert activities_file_path
+ assert video_data_manifest_file_path
+
+ # Populate video and metadata data providers
+ self._videos: Dict[str, Video] = VideoDataset._load_videos(
+ video_data_manifest_file_path,
+ video_info_file_path,
+ multithreaded_io,
+ dataset_type,
+ )
+
+ self._activities: Dict[str, List[ActivityData]] = load_dataclass_dict_from_csv(
+ activities_file_path, ActivityData, "video_id", list_per_key=True
+ )
+
+ # Sample datapoints
+ self._clips: List[VideoClipInfo] = clip_sampler(self._videos, self._activities)
+
+ self._frames_per_second = frames_per_second
+ self._user_transform = transform
+ self._transform = self._transform_clip
+ self._frame_filter = frame_filter
+
+ def __getitem__(self, index) -> Dict[str, Any]:
+ """
+ Samples a video clip associated to the given index.
+
+ Args:
+ index (int): index for the video clip.
+
+ Returns:
+ A video clip with the following format if transform is None:
+ {{
+ 'video_id': <str>,
+ 'video': <video_tensor>,
+ 'audio': <audio_tensor>,
+ 'activities': <activities_tensor>,
+ 'start_time': <float>,
+ 'stop_time': <float>
+ }}
+ Otherwise, the transform defines the clip output.
+ """
+ clip = self._clips[index]
+
+ # Filter activities by only the ones that appear within the clip boundaries,
+ # and unpack the activities so there is one per frame in the clip
+ activities_in_video = self._activities[clip.video_id]
+ activities_in_clip = []
+ for activity in activities_in_video:
+ overlap_period = get_overlap_for_time_range_pair(
+ clip.start_time, clip.stop_time, activity.start_time, activity.stop_time
+ )
+ if overlap_period is not None:
+ overlap_start_time, overlap_stop_time = overlap_period
+
+ # Convert the overlapping period between clip and activity to
+ # 0-indexed start and stop frame indexes, so we can unpack 1
+ # activity label per frame.
+ overlap_start_frame = seconds_to_frame_index(
+ overlap_start_time, self._frames_per_second
+ )
+ overlap_stop_frame = seconds_to_frame_index(
+ overlap_stop_time, self._frames_per_second
+ )
+
+ # Append 1 activity label per frame
+ for _ in range(overlap_start_frame, overlap_stop_frame):
+ activities_in_clip.append(activity)
+
+ # Convert the list of ActivityData objects to a tensor of just the activity class IDs
+ activity_class_ids = [
+ activity.activity_id for activity in activities_in_clip
+ ]
+ activity_class_ids_tensor = torch.tensor(activity_class_ids)
+
+ clip_data = {
+ "video_id": clip.video_id,
+ **self._videos[clip.video_id].get_clip(clip.start_time, clip.stop_time),
+ "activities": activity_class_ids_tensor,
+ "start_time": clip.start_time,
+ "stop_time": clip.stop_time,
+ }
+
+ if self._transform:
+ clip_data = self._transform(clip_data)
+
+ return clip_data
+
+ def __len__(self) -> int:
+ """
+ Returns:
+ The number of video clips in the dataset.
+ """
+ return len(self._clips)
+
+ def _transform_clip(self, clip: Dict[str, Any]) -> Dict[str, Any]:
+ """Transforms a given video clip, according to some pre-defined transforms
+ and an optional user transform function (self._user_transform).
+
+ Args:
+ clip (Dict[str, Any]): The clip that will be transformed.
+
+ Returns:
+ (Dict[str, Any]) The transformed clip.
+ """
+ for key in clip:
+ if clip[key] is None:
+ clip[key] = torch.tensor([])
+
+ if self._user_transform:
+ clip = self._user_transform(clip)
+
+ return clip
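+
+# Illustrative sketch (not part of the library) of how __getitem__ unpacks per-frame
+# activity labels: with self._frames_per_second == 1 and a clip spanning
+# [2.0, 6.0] seconds, an activity annotated over [0.0, 4.5] seconds overlaps the
+# clip on [2.0, 4.5]. seconds_to_frame_index maps that overlap to frames 2 and 4,
+# so range(2, 4) appends the activity's class id twice to the returned
+# "activities" tensor.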
diff --git a/pytorchvideo/data/encoded_video.py b/pytorchvideo/data/encoded_video.py
new file mode 100644
index 00000000..a5413c70
--- /dev/null
+++ b/pytorchvideo/data/encoded_video.py
@@ -0,0 +1,123 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import io
+import logging
+import pathlib
+from typing import BinaryIO, Dict, Optional, Type
+
+import torch
+from iopath.common.file_io import g_pathmgr
+from pytorchvideo.data.decoder import DecoderType
+
+from .encoded_video_pyav import EncodedVideoPyAV
+from .encoded_video_torchvision import EncodedVideoTorchVision
+from .video import Video
+
+
+logger = logging.getLogger(__name__)
+
+
+def select_video_class(decoder: str) -> Type[Video]:
+ """
+ Select the class for accessing clips based on provided decoder string
+
+ Args:
+ decoder (str): Defines what type of decoder is used to decode the video.
+ """
+ if DecoderType(decoder) == DecoderType.PYAV:
+ video_cls = EncodedVideoPyAV
+ elif DecoderType(decoder) == DecoderType.TORCHVISION:
+ video_cls = EncodedVideoTorchVision
+ else:
+ raise NotImplementedError(f"Unknown decoder type {decoder}")
+ return video_cls
+
+
+class EncodedVideo(Video):
+ """
+ EncodedVideo is an abstraction for accessing clips from an encoded video.
+ It supports selective decoding when header information is available.
+ """
+
+ @classmethod
+ def from_path(
+ cls, file_path: str, decode_audio: bool = True, decoder: str = "pyav"
+ ):
+ """
+ Fetches the given video path using PathManager (allowing remote uris to be
+ fetched) and constructs the EncodedVideo object.
+
+ Args:
+ file_path (str): a PathManager file-path.
+ """
+ # We read the file with PathManager so that we can read from remote uris.
+ with g_pathmgr.open(file_path, "rb") as fh:
+ video_file = io.BytesIO(fh.read())
+
+ return cls(video_file, pathlib.Path(file_path).name, decode_audio, decoder)
+
+ def __init__(
+ self,
+ file: BinaryIO,
+ video_name: Optional[str] = None,
+ decode_audio: bool = True,
+ decoder: str = "pyav",
+ ) -> None:
+ """
+ Args:
+ file (BinaryIO): a file-like object (e.g. io.BytesIO or io.StringIO) that
+ contains the encoded video.
+
+ decoder (str): Defines what type of decoder is used to decode the video.
+ """
+ video_cls = select_video_class(decoder)
+ self.encoded_video = video_cls(file, video_name, decode_audio)
+
+ @property
+ def name(self) -> Optional[str]:
+ """
+ Returns:
+ name: the name of the stored video if set.
+ """
+ return self.encoded_video.name
+
+ @property
+ def duration(self) -> float:
+ """
+ Returns:
+ duration: the video's duration/end-time in seconds.
+ """
+ return self.encoded_video.duration
+
+ def get_clip(
+ self, start_sec: float, end_sec: float
+ ) -> Dict[str, Optional[torch.Tensor]]:
+ """
+ Retrieves frames from the encoded video at the specified start and end times
+ in seconds (the video always starts at 0 seconds).
+
+ Args:
+ start_sec (float): the clip start time in seconds
+ end_sec (float): the clip end time in seconds
+ Returns:
+ clip_data:
+ A dictionary mapping the entries at "video" and "audio" to tensors.
+
+ "video": A tensor of the clip's RGB frames with shape:
+ (channel, time, height, width). The frames are of type torch.float32 and
+ in the range [0 - 255].
+
+ "audio": A tensor of the clip's audio samples with shape:
+ (samples). The samples are of type torch.float32 and
+ in the range [0 - 255].
+
+ Returns None if no video or audio is found within the time range.
+
+ """
+ return self.encoded_video.get_clip(start_sec, end_sec)
+
+ def close(self):
+ """
+ Closes the internal video container.
+ """
+ self.encoded_video.close()
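+
+# A minimal usage sketch (assumption: "video.mp4" is a hypothetical local file with
+# both video and audio streams; only the API defined above is used):
+#
+#   from pytorchvideo.data.encoded_video import EncodedVideo
+#
+#   video = EncodedVideo.from_path("video.mp4", decode_audio=True, decoder="pyav")
+#   print(video.duration)                  # duration in seconds
+#   clip = video.get_clip(start_sec=0.0, end_sec=2.0)
+#   frames = clip["video"]                 # (C, T, H, W) float32 tensor, or None
+#   audio = clip["audio"]                  # (samples,) float32 tensor, or None
+#   video.close()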
diff --git a/pytorchvideo/data/encoded_video_dataset.py b/pytorchvideo/data/encoded_video_dataset.py
new file mode 100644
index 00000000..c0425c5a
--- /dev/null
+++ b/pytorchvideo/data/encoded_video_dataset.py
@@ -0,0 +1,279 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from __future__ import annotations
+
+import logging
+import multiprocessing
+import pathlib
+from typing import Any, Callable, List, Optional, Tuple, Type
+
+import torch.utils.data
+from pytorchvideo.data.clip_sampling import ClipSampler
+from pytorchvideo.data.encoded_video import EncodedVideo
+
+from .labeled_video_paths import LabeledVideoPaths
+from .utils import MultiProcessSampler
+
+
+logger = logging.getLogger(__name__)
+
+
+class EncodedVideoDataset(torch.utils.data.IterableDataset):
+ """
+ EncodedVideoDataset handles the storage, loading, decoding and clip sampling for a
+ video dataset. It assumes each video is stored as an encoded video (e.g. mp4, avi).
+ """
+
+ _MAX_CONSECUTIVE_FAILURES = 10
+
+ def __init__(
+ self,
+ labeled_video_paths: List[Tuple[str, Optional[dict]]],
+ clip_sampler: ClipSampler,
+ video_sampler: Type[torch.utils.data.Sampler] = torch.utils.data.RandomSampler,
+ transform: Optional[Callable[[dict], Any]] = None,
+ decode_audio: bool = True,
+ decoder: str = "pyav",
+ ) -> None:
+ """
+ Args:
+ labeled_video_paths (List[Tuple[str, Optional[dict]]]): List containing
+ video file paths and associated labels
+
+ clip_sampler (ClipSampler): Defines how clips should be sampled from each
+ video. See the clip sampling documentation for more information.
+
+ video_sampler (Type[torch.utils.data.Sampler]): Sampler for the internal
+ video container. This defines the order videos are decoded and,
+ if necessary, the distributed split.
+
+ transform (Callable): This callable is evaluated on the clip output before
+ the clip is returned. It can be used for user defined preprocessing and
+ augmentations to the clips. The clip output is a dictionary with the
+ following format:
+ {
+ 'video': <video_tensor>,
+ 'label': <label>,
+ 'video_index': <video_index>,
+ 'clip_index': <clip_index>,
+ 'aug_index': <aug_index>, the augmentation index, as augmentations
+ might generate multiple views for one clip.
+ }
+ If transform is None, the raw clip output in the above format is
+ returned unmodified.
+
+ decoder (str): Defines what type of decoder is used to decode the video.
+ """
+ self._decode_audio = decode_audio
+ self._transform = transform
+ self._clip_sampler = clip_sampler
+ self._labeled_videos = labeled_video_paths
+ self._decoder = decoder
+
+ # If a RandomSampler is used we need to pass in a custom random generator that
+ # ensures all PyTorch multiprocess workers have the same random seed.
+ self._video_random_generator = None
+ if video_sampler == torch.utils.data.RandomSampler:
+ self._video_random_generator = torch.Generator()
+ self._video_sampler = video_sampler(
+ self._labeled_videos, generator=self._video_random_generator
+ )
+ else:
+ self._video_sampler = video_sampler(self._labeled_videos)
+
+ self._video_sampler_iter = None # Initialized on first call to self.__next__()
+
+ # Depending on the clip sampler type, we may want to sample multiple clips
+ # from one video. In that case, we store the video, label and previously
+ # sampled clip time in these variables.
+ self._loaded_video_label = None
+ self._loaded_clip = None
+ self._next_clip_start_time = 0.0
+
+ @property
+ def video_sampler(self):
+ return self._video_sampler
+
+ def __next__(self) -> dict:
+ """
+ Retrieves the next clip based on the clip sampling strategy and video sampler.
+
+ Returns:
+ A video clip with the following format if transform is None:
+ {
+ 'video': <video_tensor>,
+ 'label': <label>,
+ 'video_index': <video_index>,
+ 'clip_index': <clip_index>,
+ 'aug_index': <aug_index>, the augmentation index, as augmentations
+ might generate multiple views for one clip.
+ }
+ Otherwise, the transform defines the clip output.
+ """
+ if not self._video_sampler_iter:
+ # Setup MultiProcessSampler here - after PyTorch DataLoader workers are spawned.
+ self._video_sampler_iter = iter(MultiProcessSampler(self._video_sampler))
+
+ for i_try in range(self._MAX_CONSECUTIVE_FAILURES):
+ # Reuse previously stored video if there are still clips to be sampled from
+ # the last loaded video.
+ if self._loaded_video_label:
+ video, info_dict, video_index = self._loaded_video_label
+ else:
+ video_index = next(self._video_sampler_iter)
+ try:
+ video_path, info_dict = self._labeled_videos[video_index]
+ video = EncodedVideo.from_path(
+ video_path,
+ decode_audio=self._decode_audio,
+ decoder=self._decoder,
+ )
+ self._loaded_video_label = (video, info_dict, video_index)
+ except Exception as e:
+ logger.debug(
+ "Failed to load video with error: {}; trial {}".format(
+ e,
+ i_try,
+ )
+ )
+ continue
+
+ (
+ clip_start,
+ clip_end,
+ clip_index,
+ aug_index,
+ is_last_clip,
+ ) = self._clip_sampler(self._next_clip_start_time, video.duration)
+ # Only load the clip once and reuse previously stored clip if there are multiple
+ # views for augmentations to perform on the same clip.
+ if aug_index == 0:
+ self._loaded_clip = video.get_clip(clip_start, clip_end)
+ self._next_clip_start_time = clip_end
+
+ clip_is_null = (
+ self._loaded_clip is None
+ or self._loaded_clip["video"] is None
+ or (self._loaded_clip["audio"] is None and self._decode_audio)
+ )
+ if is_last_clip or clip_is_null:
+ # Close the loaded encoded video and reset the last sampled clip time ready
+ # to sample a new video on the next iteration.
+ self._loaded_video_label[0].close()
+ self._loaded_video_label = None
+ self._next_clip_start_time = 0.0
+
+ if clip_is_null:
+ logger.debug(
+ "Failed to load clip {}; trial {}".format(video.name, i_try)
+ )
+ continue
+
+ frames = self._loaded_clip["video"]
+ audio_samples = self._loaded_clip["audio"]
+ sample_dict = {
+ "video": frames,
+ "video_name": video.name,
+ "video_index": video_index,
+ "clip_index": clip_index,
+ "aug_index": aug_index,
+ **info_dict,
+ **({"audio": audio_samples} if audio_samples is not None else {}),
+ }
+ if self._transform is not None:
+ sample_dict = self._transform(sample_dict)
+
+ # User can force dataset to continue by returning None in transform.
+ if sample_dict is None:
+ continue
+
+ return sample_dict
+ else:
+ raise RuntimeError(
+ f"Failed to load video after {self._MAX_CONSECUTIVE_FAILURES} retries."
+ )
+
+ def __iter__(self):
+ self._video_sampler_iter = None # Reset video sampler
+
+ # If we're in a PyTorch DataLoader multiprocessing context, we need to use the
+ # same seed for each worker's RandomSampler generator. The workers at each
+ # __iter__ call are created from the unique value: worker_info.seed - worker_info.id,
+ # which we can use for this seed.
+ worker_info = torch.utils.data.get_worker_info()
+ if self._video_random_generator is not None and worker_info is not None:
+ base_seed = worker_info.seed - worker_info.id
+ self._video_random_generator.manual_seed(base_seed)
+
+ return self
+
+ def num_videos(self):
+ return len(self.video_sampler)
+
+
+def labeled_encoded_video_dataset(
+ data_path: pathlib.Path,
+ clip_sampler: ClipSampler,
+ video_sampler: Type[torch.utils.data.Sampler] = torch.utils.data.RandomSampler,
+ transform: Optional[Callable[[dict], Any]] = None,
+ video_path_prefix: str = "",
+ decode_audio: bool = True,
+ decoder: str = "pyav",
+) -> EncodedVideoDataset:
+ """
+ A helper function to create an EncodedVideoDataset object for the Ucf101 and
+ Kinetics datasets.
+
+ Args:
+ data_path (pathlib.Path): Path to the data. The path type defines how the
+ data should be read:
+ - For a file path, the file is read and each line is parsed into a
+ video path and label.
+ - For a directory, the directory structure defines the classes
+ (i.e. each subdirectory is a class).
+ See the LabeledVideoPaths class documentation for specific formatting
+ details and examples.
+
+ clip_sampler (ClipSampler): Defines how clips should be sampled from each
+ video. See the clip sampling documentation for more information.
+
+ video_sampler (Type[torch.utils.data.Sampler]): Sampler for the internal
+ video container. This defines the order videos are decoded and,
+ if necessary, the distributed split.
+
+ transform (Callable): This callable is evaluated on the clip output before
+ the clip is returned. It can be used for user defined preprocessing and
+ augmentations to the clips. The clip output is a dictionary with the
+ following format:
+ {
+ 'video': <video_tensor>,
+ 'label': <label>,
+ 'video_index': <video_index>,
+ 'clip_index': <clip_index>,
+ 'aug_index': <aug_index>, the augmentation index, as augmentations
+ might generate multiple views for one clip.
+ }
+ If transform is None, the raw clip output in the above format is
+ returned unmodified.
+
+ video_path_prefix (str): Path to root directory with the videos that are
+ loaded in EncodedVideoDataset. All the video paths before loading
+ are prefixed with this path.
+
+ decoder (str): Defines what type of decoder is used to decode the video.
+
+ """
+ # PathManager may configure the multiprocessing context in a way that conflicts
+ # with PyTorch DataLoader workers. To avoid this, we make sure the PathManager
+ # calls (made by LabeledVideoPaths) are wrapped in their own sandboxed process.
+ labeled_video_paths = LabeledVideoPaths.from_path(data_path)
+
+ labeled_video_paths.path_prefix = video_path_prefix
+ dataset = EncodedVideoDataset(
+ labeled_video_paths,
+ clip_sampler,
+ video_sampler,
+ transform,
+ decode_audio=decode_audio,
+ decoder=decoder,
+ )
+ return dataset
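+
+# A minimal usage sketch. Assumptions: "kinetics/train.csv" is a hypothetical
+# manifest whose lines LabeledVideoPaths can parse into (video path, label) pairs,
+# and RandomClipSampler is one of the samplers defined in
+# pytorchvideo.data.clip_sampling elsewhere in this patch:
+#
+#   from pytorchvideo.data.clip_sampling import RandomClipSampler
+#   from pytorchvideo.data.encoded_video_dataset import labeled_encoded_video_dataset
+#
+#   dataset = labeled_encoded_video_dataset(
+#       data_path="kinetics/train.csv",
+#       clip_sampler=RandomClipSampler(2.0),  # 2-second clips
+#       decode_audio=False,
+#   )
+#   sample = next(iter(dataset))  # dict with "video", "label", "video_index", ...
+#
+# Because EncodedVideoDataset is an IterableDataset, it can also be wrapped
+# directly in a torch.utils.data.DataLoader.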
diff --git a/pytorchvideo/data/encoded_video_pyav.py b/pytorchvideo/data/encoded_video_pyav.py
new file mode 100644
index 00000000..07635f35
--- /dev/null
+++ b/pytorchvideo/data/encoded_video_pyav.py
@@ -0,0 +1,286 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import logging
+import math
+from typing import BinaryIO, Dict, List, Optional, Tuple
+
+import av
+import numpy as np
+import torch
+
+from .utils import pts_to_secs, secs_to_pts, thwc_to_cthw
+from .video import Video
+
+
+logger = logging.getLogger(__name__)
+
+
+class EncodedVideoPyAV(Video):
+ """
+ EncodedVideoPyAV is an abstraction for accessing clips from an encoded video using
+ PyAV as the decoding backend. It supports selective decoding when header information
+ is available.
+ """
+
+ def __init__(
+ self,
+ file: BinaryIO,
+ video_name: Optional[str] = None,
+ decode_audio: bool = True,
+ ) -> None:
+ """
+ Args:
+ file (BinaryIO): a file-like object (e.g. io.BytesIO or io.StringIO) that
+ contains the encoded video.
+ """
+ self._video_name = video_name
+ self._decode_audio = decode_audio
+
+ try:
+ self._container = av.open(file)
+ except Exception as e:
+ raise RuntimeError(f"Failed to open video {video_name}. {e}")
+
+ if self._container is None or len(self._container.streams.video) == 0:
+ raise RuntimeError(f"Video stream not found {video_name}")
+
+ # Retrieve video header information if available.
+ video_stream = self._container.streams.video[0]
+ self._video_time_base = video_stream.time_base
+ self._video_start_pts = video_stream.start_time
+ if self._video_start_pts is None:
+ self._video_start_pts = 0.0
+
+ video_duration = video_stream.duration
+
+ # Retrieve audio header information if available.
+ audio_duration = None
+ self._has_audio = None
+ if self._decode_audio:
+ self._has_audio = self._container.streams.audio
+ if self._has_audio:
+ self._audio_time_base = self._container.streams.audio[0].time_base
+ self._audio_start_pts = self._container.streams.audio[0].start_time
+ if self._audio_start_pts is None:
+ self._audio_start_pts = 0.0
+
+ audio_duration = self._container.streams.audio[0].duration
+
+ # If duration isn't found in header the whole video is decoded to
+ # determine the duration.
+ self._video, self._audio, self._selective_decoding = (None, None, True)
+ if audio_duration is None and video_duration is None:
+ self._selective_decoding = False
+ self._video, self._audio = self._pyav_decode_video()
+ if self._video is None:
+ raise RuntimeError("Unable to decode video stream")
+
+ video_duration = self._video[-1][1]
+ if self._audio is not None:
+ audio_duration = self._audio[-1][1]
+
+ # Take the larger duration of either the video or the audio stream.
+ if audio_duration is not None and video_duration is not None:
+ self._duration = max(
+ pts_to_secs(
+ video_duration, self._video_time_base, self._video_start_pts
+ ),
+ pts_to_secs(
+ audio_duration, self._audio_time_base, self._audio_start_pts
+ ),
+ )
+ elif video_duration is not None:
+ self._duration = pts_to_secs(
+ video_duration, self._video_time_base, self._video_start_pts
+ )
+
+ elif audio_duration is not None:
+ self._duration = pts_to_secs(
+ audio_duration, self._audio_time_base, self._audio_start_pts
+ )
+
+ @property
+ def name(self) -> Optional[str]:
+ """
+ Returns:
+ name: the name of the stored video if set.
+ """
+ return self._video_name
+
+ @property
+ def duration(self) -> float:
+ """
+ Returns:
+ duration: the video's duration/end-time in seconds.
+ """
+ return self._duration
+
+ def get_clip(
+ self, start_sec: float, end_sec: float
+ ) -> Dict[str, Optional[torch.Tensor]]:
+ """
+ Retrieves frames from the encoded video at the specified start and end times
+ in seconds (the video always starts at 0 seconds).
+
+ Args:
+ start_sec (float): the clip start time in seconds
+ end_sec (float): the clip end time in seconds
+ Returns:
+ clip_data:
+ A dictionary mapping the entries at "video" and "audio" to tensors.
+
+ "video": A tensor of the clip's RGB frames with shape:
+ (channel, time, height, width). The frames are of type torch.float32 and
+ in the range [0 - 255].
+
+ "audio": A tensor of the clip's audio samples with shape:
+ (samples). The samples are of type torch.float32 and
+ in the range [0 - 255].
+
+ Returns None if no video or audio is found within the time range.
+
+ """
+ if self._selective_decoding:
+ self._video, self._audio = self._pyav_decode_video(start_sec, end_sec)
+
+ video_frames = None
+ if self._video is not None:
+ video_start_pts = secs_to_pts(
+ start_sec, self._video_time_base, self._video_start_pts
+ )
+ video_end_pts = secs_to_pts(
+ end_sec, self._video_time_base, self._video_start_pts
+ )
+ video_frames = [
+ f
+ for f, pts in self._video
+ if pts >= video_start_pts and pts <= video_end_pts
+ ]
+
+ audio_samples = None
+ if self._has_audio and self._audio is not None:
+ audio_start_pts = secs_to_pts(
+ start_sec, self._audio_time_base, self._audio_start_pts
+ )
+ audio_end_pts = secs_to_pts(
+ end_sec, self._audio_time_base, self._audio_start_pts
+ )
+ audio_samples = [
+ f
+ for f, pts in self._audio
+ if pts >= audio_start_pts and pts <= audio_end_pts
+ ]
+ audio_samples = torch.cat(audio_samples, axis=0)
+ audio_samples = audio_samples.to(torch.float32)
+
+ if video_frames is None or len(video_frames) == 0:
+ logger.debug(
+ f"No video found within {start_sec} and {end_sec} seconds. "
+ f"Video starts at time 0 and ends at {self.duration}."
+ )
+
+ video_frames = None
+
+ if video_frames is not None:
+ video_frames = thwc_to_cthw(torch.stack(video_frames)).to(torch.float32)
+
+ return {
+ "video": video_frames,
+ "audio": audio_samples,
+ }
+
+ def close(self):
+ """
+ Closes the internal video container.
+ """
+ if self._container is not None:
+ self._container.close()
+
+ def _pyav_decode_video(
+ self, start_secs: float = 0.0, end_secs: float = math.inf
+ ) -> Tuple[Optional[List], Optional[List]]:
+ """
+ Selectively decodes the video (and audio, if present) between start_secs and
+ end_secs (in seconds), returning lists of (frame, pts) pairs for each stream.
+ """
+ video_and_pts = None
+ audio_and_pts = None
+ try:
+ pyav_video_frames, _ = _pyav_decode_stream(
+ self._container,
+ secs_to_pts(start_secs, self._video_time_base, self._video_start_pts),
+ secs_to_pts(end_secs, self._video_time_base, self._video_start_pts),
+ self._container.streams.video[0],
+ {"video": 0},
+ )
+ if len(pyav_video_frames) > 0:
+ video_and_pts = [
+ (torch.from_numpy(frame.to_rgb().to_ndarray()), frame.pts)
+ for frame in pyav_video_frames
+ ]
+
+ if self._has_audio:
+ pyav_audio_frames, _ = _pyav_decode_stream(
+ self._container,
+ secs_to_pts(
+ start_secs, self._audio_time_base, self._audio_start_pts
+ ),
+ secs_to_pts(end_secs, self._audio_time_base, self._audio_start_pts),
+ self._container.streams.audio[0],
+ {"audio": 0},
+ )
+
+ if len(pyav_audio_frames) > 0:
+ audio_and_pts = [
+ (
+ torch.from_numpy(np.mean(frame.to_ndarray(), axis=0)),
+ frame.pts,
+ )
+ for frame in pyav_audio_frames
+ ]
+
+ except Exception as e:
+ logger.debug(f"Failed to decode video: {self._video_name}. {e}")
+
+ return video_and_pts, audio_and_pts
+
+
+def _pyav_decode_stream(
+ container: av.container.input.InputContainer,
+ start_pts: float,
+ end_pts: float,
+ stream: av.video.stream.VideoStream,
+ stream_name: dict,
+ buffer_size: int = 0,
+) -> Tuple[List, float]:
+ """
+ Decode the video with PyAV decoder.
+ Args:
+ container (container): PyAV container.
+ start_pts (int): the starting Presentation TimeStamp to fetch the
+ video frames.
+ end_pts (int): the ending Presentation TimeStamp of the decoded frames.
+ stream (stream): PyAV stream.
+ stream_name (dict): a dictionary of streams. For example, {"video": 0}
+ means video stream at stream index 0.
+ Returns:
+ result (list): list of decoded frames.
+ max_pts (int): max Presentation TimeStamp of the video sequence.
+ """
+
+ # Seeking in the stream is imprecise. Thus, seek to an earlier pts by a
+ # margin pts.
+ margin = 1024
+ seek_offset = max(start_pts - margin, 0)
+ container.seek(int(seek_offset), any_frame=False, backward=True, stream=stream)
+ frames = {}
+ max_pts = 0
+ for frame in container.decode(**stream_name):
+ max_pts = max(max_pts, frame.pts)
+ if frame.pts >= start_pts and frame.pts <= end_pts:
+ frames[frame.pts] = frame
+ elif frame.pts > end_pts:
+ break
+
+ result = [frames[pts] for pts in sorted(frames)]
+ return result, max_pts
diff --git a/pytorchvideo/data/encoded_video_torchvision.py b/pytorchvideo/data/encoded_video_torchvision.py
new file mode 100644
index 00000000..7f71c157
--- /dev/null
+++ b/pytorchvideo/data/encoded_video_torchvision.py
@@ -0,0 +1,255 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import logging
+from typing import BinaryIO, Dict, Optional, Tuple
+
+import numpy as np
+import torch
+
+from .utils import pts_to_secs, secs_to_pts, thwc_to_cthw
+from .video import Video
+
+
+logger = logging.getLogger(__name__)
+
+
+class EncodedVideoTorchVision(Video):
+ """
+ EncodedVideoTorchVision is an abstraction for accessing clips from an encoded
+ video, using the torchvision video reading API
+ (torch.ops.video_reader.read_video_from_memory) as the decoding backend.
+ """
+
+ """
+ av_seek_frame is imprecise, so seek to a timestamp earlier by a margin.
+ The unit of the margin is seconds.
+ """
+ SEEK_FRAME_MARGIN = 0.25
+
+ def __init__(
+ self,
+ file: BinaryIO,
+ video_name: Optional[str] = None,
+ decode_audio: bool = True,
+ ) -> None:
+ self._video_tensor = torch.tensor(
+ np.frombuffer(file.getvalue(), dtype=np.uint8)
+ )
+ self._video_name = video_name
+ self._decode_audio = decode_audio
+
+ (
+ self._video,
+ self._video_time_base,
+ self._video_start_pts,
+ video_duration,
+ self._audio,
+ self._audio_time_base,
+ self._audio_start_pts,
+ audio_duration,
+ ) = self._torch_vision_decode_video()
+
+ # Take the larger duration of either the video or the audio stream.
+ if audio_duration is not None and video_duration is not None:
+ self._duration = max(
+ pts_to_secs(
+ video_duration, self._video_time_base, self._video_start_pts
+ ),
+ pts_to_secs(
+ audio_duration, self._audio_time_base, self._audio_start_pts
+ ),
+ )
+ elif video_duration is not None:
+ self._duration = pts_to_secs(
+ video_duration, self._video_time_base, self._video_start_pts
+ )
+
+ elif audio_duration is not None:
+ self._duration = pts_to_secs(
+ audio_duration, self._audio_time_base, self._audio_start_pts
+ )
+
+ @property
+ def name(self) -> Optional[str]:
+ """
+ Returns:
+ name: the name of the stored video if set.
+ """
+ return self._video_name
+
+ @property
+ def duration(self) -> float:
+ """
+ Returns:
+ duration: the video's duration/end-time in seconds.
+ """
+ return self._duration
+
+ def close(self):
+ pass
+
+ def get_clip(
+ self, start_sec: float, end_sec: float
+ ) -> Dict[str, Optional[torch.Tensor]]:
+ """
+ Retrieves frames from the encoded video at the specified start and end times
+ in seconds (the video always starts at 0 seconds).
+
+ Args:
+ start_sec (float): the clip start time in seconds
+ end_sec (float): the clip end time in seconds
+ Returns:
+ clip_data:
+ A dictionary mapping the entries at "video" and "audio" to tensors.
+
+ "video": A tensor of the clip's RGB frames with shape:
+ (channel, time, height, width). The frames are of type torch.float32 and
+ in the range [0 - 255].
+
+ "audio": A tensor of the clip's audio samples with shape:
+ (samples). The samples are of type torch.float32 and
+ in the range [0 - 255].
+
+ Returns None if no video or audio is found within the time range.
+
+ """
+ video_frames = None
+ if self._video is not None:
+ video_start_pts = secs_to_pts(
+ start_sec, self._video_time_base, self._video_start_pts
+ )
+ video_end_pts = secs_to_pts(
+ end_sec, self._video_time_base, self._video_start_pts
+ )
+ video_frames = [
+ f
+ for f, pts in self._video
+ if pts >= video_start_pts and pts <= video_end_pts
+ ]
+
+ audio_samples = None
+ if self._decode_audio and self._audio:
+ audio_start_pts = secs_to_pts(
+ start_sec, self._audio_time_base, self._audio_start_pts
+ )
+ audio_end_pts = secs_to_pts(
+ end_sec, self._audio_time_base, self._audio_start_pts
+ )
+ audio_samples = [
+ f
+ for f, pts in self._audio
+ if pts >= audio_start_pts and pts <= audio_end_pts
+ ]
+ audio_samples = torch.cat(audio_samples, axis=0)
+ audio_samples = audio_samples.to(torch.float32)
+
+ if video_frames is None or len(video_frames) == 0:
+ logger.warning(
+ f"No video found within {start_sec} and {end_sec} seconds. "
+ f"Video starts at time 0 and ends at {self.duration}."
+ )
+
+ video_frames = None
+
+ if video_frames is not None:
+ video_frames = thwc_to_cthw(torch.stack(video_frames)).to(torch.float32)
+
+ return {
+ "video": video_frames,
+ "audio": audio_samples,
+ }
+
+ def _torch_vision_decode_video(
+ self, start_pts: int = 0, end_pts: int = -1
+ ) -> Tuple:
+ """
+ Decode the video in the PTS range [start_pts, end_pts]
+ """
+ video_and_pts = None
+ audio_and_pts = None
+
+ width, height, min_dimension, max_dimension = 0, 0, 0, 0
+ video_start_pts, video_end_pts = start_pts, end_pts
+ video_timebase_num, video_timebase_den = 0, 1
+
+ samples, channels = 0, 0
+ audio_start_pts, audio_end_pts = start_pts, end_pts
+ audio_timebase_num, audio_timebase_den = 0, 1
+
+ try:
+ tv_result = torch.ops.video_reader.read_video_from_memory(
+ self._video_tensor,
+ self.SEEK_FRAME_MARGIN,
+ # Set getPtsOnly=0, i.e., read full video rather than just header
+ 0,
+ # Read video stream
+ 1,
+ width,
+ height,
+ min_dimension,
+ max_dimension,
+ video_start_pts,
+ video_end_pts,
+ video_timebase_num,
+ video_timebase_den,
+ # Read audio stream
+ self._decode_audio,
+ samples,
+ channels,
+ audio_start_pts,
+ audio_end_pts,
+ audio_timebase_num,
+ audio_timebase_den,
+ )
+ except Exception as e:
+ logger.warning(f"Failed to decode video of name {self.video_name}. {e}")
+ raise e
+
+ (
+ vframes,
+ vframes_pts,
+ vtimebase,
+ _,
+ vduration,
+ aframes,
+ aframe_pts,
+ atimebase,
+ _,
+ aduration,
+ ) = tv_result
+
+ if vduration < 0:
+ # No header information to infer video duration
+ video_duration = float(vframes_pts[-1])
+ else:
+ video_duration = float(vduration)
+
+ video_and_pts = list(zip(vframes, vframes_pts))
+ video_start_pts = int(vframes_pts[0])
+ video_time_base = float(vtimebase[0] / vtimebase[1])
+
+ audio_and_pts = None
+ audio_time_base = None
+ audio_start_pts = None
+ audio_duration = None
+ if self._decode_audio:
+ if aduration < 0:
+ # No header information to infer audio duration
+ audio_duration = float(aframe_pts[-1])
+ else:
+ audio_duration = float(aduration)
+
+ audio_and_pts = list(zip(aframes, aframe_pts))
+ audio_start_pts = int(aframe_pts[0])
+ audio_time_base = float(atimebase[0] / atimebase[1])
+
+ return (
+ video_and_pts,
+ video_time_base,
+ video_start_pts,
+ video_duration,
+ audio_and_pts,
+ audio_time_base,
+ audio_start_pts,
+ audio_duration,
+ )
diff --git a/pytorchvideo/data/epic_kitchen/__init__.py b/pytorchvideo/data/epic_kitchen/__init__.py
new file mode 100644
index 00000000..dea20a04
--- /dev/null
+++ b/pytorchvideo/data/epic_kitchen/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from .epic_kitchen_dataset import ActionData, EpicKitchenDataset
diff --git a/pytorchvideo/data/epic_kitchen/epic_kitchen_dataset.py b/pytorchvideo/data/epic_kitchen/epic_kitchen_dataset.py
new file mode 100644
index 00000000..7cdbbf18
--- /dev/null
+++ b/pytorchvideo/data/epic_kitchen/epic_kitchen_dataset.py
@@ -0,0 +1,195 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import ast
+from dataclasses import dataclass, fields as dataclass_fields
+from typing import Any, Callable, Dict, List, Optional
+
+import torch
+from pytorchvideo.data.dataset_manifest_utils import (
+ EncodedVideoInfo,
+ VideoClipInfo,
+ VideoDataset,
+ VideoDatasetType,
+ VideoFrameInfo,
+ VideoInfo,
+ get_seconds_from_hms_time,
+)
+from pytorchvideo.data.utils import DataclassFieldCaster, load_dataclass_dict_from_csv
+from pytorchvideo.data.video import Video
+
+
+@dataclass
+class ActionData(DataclassFieldCaster):
+ """
+ Class representing an action from the Epic Kitchen dataset.
+ """
+
+ participant_id: str
+ video_id: str
+ narration: str
+ start_timestamp: str
+ stop_timestamp: str
+ start_frame: int
+ stop_frame: int
+ verb: str
+ verb_class: int
+ noun: str
+ noun_class: int
+ all_nouns: list = DataclassFieldCaster.complex_initialized_dataclass_field(
+ ast.literal_eval
+ )
+ all_noun_classes: list = DataclassFieldCaster.complex_initialized_dataclass_field(
+ ast.literal_eval
+ )
+
+ @property
+ def start_time(self) -> float:
+ return get_seconds_from_hms_time(self.start_timestamp)
+
+ @property
+ def stop_time(self) -> float:
+ return get_seconds_from_hms_time(self.stop_timestamp)
+
+
+class EpicKitchenDataset(torch.utils.data.Dataset):
+ """
+ Video dataset for EpicKitchen-55 Dataset
+
+
+ This dataset handles the loading, decoding, and configurable clip
+ sampling for the videos.
+ """
+
+ def __init__(
+ self,
+ video_info_file_path: str,
+ actions_file_path: str,
+ clip_sampler: Callable[
+ [Dict[str, Video], Dict[str, List[ActionData]]], List[VideoClipInfo]
+ ],
+ video_data_manifest_file_path: str,
+ dataset_type: VideoDatasetType = VideoDatasetType.Frame,
+ transform: Optional[Callable[[Dict[str, Any]], Any]] = None,
+ frame_filter: Optional[Callable[[List[int]], List[int]]] = None,
+ multithreaded_io: bool = True,
+ ) -> None:
+ f"""
+ Args:
+ video_info_file_path (str):
+ Path or URI to manifest with basic metadata of each video.
+ File must be a csv (w/header) with columns:
+ {[f.name for f in dataclass_fields(VideoInfo)]}
+
+ actions_file_path (str):
+ Path or URI to manifest with action annotations for each video.
+ File must be a csv (w/header) with columns:
+ {[f.name for f in dataclass_fields(ActionData)]}
+
+ clip_sampler (Callable[[Dict[str, Video], Dict[str, List[ActionData]]],
+ List[VideoClipInfo]]):
+ This callable takes as input all available videos and outputs a list of clips to
+ be loaded by the dataset.
+
+ video_data_manifest_file_path (str):
+ The path to a manifest file outlining the available video data for the
+ associated videos. File must be a csv (w/header) with columns:
+ {[f.name for f in dataclass_fields(VideoFrameInfo)]}
+
+ or
+ {[f.name for f in dataclass_fields(EncodedVideoInfo)]}
+
+ To generate this file from a directory of video frames, see helper
+ functions in Module: pytorchvideo.data.epic_kitchen.utils
+
+ dataset_type (VideoDatasetType): The data format in which the dataset's
+ video data is stored (e.g. video frames, encoded video, etc.).
+
+ transform (Optional[Callable[[Dict[str, Any]], Any]]):
+ This callable is evaluated on the clip output before the clip is returned.
+ It can be used for user-defined preprocessing and augmentations to the clips.
+
+ The clip input is a dictionary with the following format:
+ {{
+ 'video': <video_tensor>,
+ 'audio': <audio_tensor>,
+ 'actions': <list of ActionData>,
+ 'start_time': <float>,
+ 'stop_time': <float>
+ }}
+
+ If transform is None, the raw clip output in the above format is
+ returned unmodified.
+
+ frame_filter (Optional[Callable[[List[int]], List[int]]]):
+ This callable is evaluated on the set of available frame indices to be
+ included in a sampled clip. This can be used to subselect frames within
+ a clip to be loaded.
+
+ multithreaded_io (bool):
+ Boolean to control whether parallelizable IO operations are performed across
+ multiple threads.
+
+ """
+ assert video_info_file_path
+ assert actions_file_path
+ assert video_data_manifest_file_path
+ assert clip_sampler
+
+ # Populate video and metadata data providers
+ self._videos: Dict[str, Video] = VideoDataset._load_videos(
+ video_data_manifest_file_path,
+ video_info_file_path,
+ multithreaded_io,
+ dataset_type,
+ )
+
+ self._actions: Dict[str, List[ActionData]] = load_dataclass_dict_from_csv(
+ actions_file_path, ActionData, "video_id", list_per_key=True
+ )
+ # Sample datapoints
+ self._clips: List[VideoClipInfo] = clip_sampler(self._videos, self._actions)
+
+ self._transform = transform
+ self._frame_filter = frame_filter
+
+ def __getitem__(self, index) -> Dict[str, Any]:
+ """
+ Samples a video clip associated to the given index.
+
+ Args:
+ index (int): index for the video clip.
+
+ Returns:
+ A video clip with the following format if transform is None:
+ {{
+ 'video_id': <str>,
+ 'video': <video_tensor>,
+ 'audio': <audio_tensor>,
+ 'actions': <list of ActionData>,
+ 'start_time': <float>,
+ 'stop_time': <float>
+ }}
+ Otherwise, the transform defines the clip output.
+ """
+ clip = self._clips[index]
+
+ clip_data = {
+ "video_id": clip.video_id,
+ **self._videos[clip.video_id].get_clip(
+ clip.start_time, clip.stop_time, self._frame_filter
+ ),
+ "actions": self._actions[clip.video_id],
+ "start_time": clip.start_time,
+ "stop_time": clip.stop_time,
+ }
+
+ if self._transform:
+ clip_data = self._transform(clip_data)
+
+ return clip_data
+
+ def __len__(self) -> int:
+ """
+ Returns:
+ The number of video clips in the dataset.
+ """
+ return len(self._clips)
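+
+# Hypothetical usage sketch (the manifest paths are placeholders and the clip
+# sampler below simply turns every annotated action into one clip):
+#
+#   from pytorchvideo.data.dataset_manifest_utils import VideoClipInfo
+#   from pytorchvideo.data.epic_kitchen import EpicKitchenDataset
+#
+#   def one_clip_per_action(videos, actions_by_video):
+#       return [
+#           VideoClipInfo(video_id, action.start_time, action.stop_time)
+#           for video_id, actions in actions_by_video.items()
+#           for action in actions
+#       ]
+#
+#   dataset = EpicKitchenDataset(
+#       video_info_file_path="manifests/video_info.csv",
+#       actions_file_path="manifests/actions.csv",
+#       clip_sampler=one_clip_per_action,
+#       video_data_manifest_file_path="manifests/frame_manifest.csv",
+#   )
+#   clip = dataset[0]  # dict with "video_id", "actions", "start_time", "stop_time",
+#                      # plus the decoded clip data (e.g. "video").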
diff --git a/pytorchvideo/data/epic_kitchen/utils.py b/pytorchvideo/data/epic_kitchen/utils.py
new file mode 100644
index 00000000..4c1b3d3c
--- /dev/null
+++ b/pytorchvideo/data/epic_kitchen/utils.py
@@ -0,0 +1,197 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Dict
+
+from iopath.common.file_io import g_pathmgr
+from pytorchvideo.data.dataset_manifest_utils import EncodedVideoInfo, VideoFrameInfo
+from pytorchvideo.data.utils import (
+ optional_threaded_foreach,
+)
+
+
+def build_frame_manifest_from_flat_directory(
+ data_directory_path: str, multithreaded: bool
+) -> Dict[str, VideoFrameInfo]:
+ """
+ Args:
+ data_directory_path (str): Path or URI to EpicKitchenDataset data.
+ Data at this path must be a folder of structure:
+ {
+ "{video_id}": [
+ "frame_{frame_number}.{file_extension}",
+ "frame_{frame_number}.{file_extension}",
+ "frame_{frame_number}.{file_extension}",
+ ...]
+ ...}
+ multithreaded (bool):
+ controls whether io operations are performed across multiple threads.
+
+ Returns:
+ Dictionary mapping video_id of available videos to the locations of their
+ underlying frame files.
+ """
+
+ video_frames = {}
+ video_ids = g_pathmgr.ls(str(data_directory_path))
+
+ def add_video_frames(video_id: str, video_path: str) -> None:
+ video_frame_file_names = sorted(g_pathmgr.ls(video_path))
+ for frame in video_frame_file_names:
+ file_extension = frame.split(".")[-1]
+ frame_name = frame[: -(len(file_extension) + 1)]
+ stem, path_frame_id = frame_name.split("_")
+ if video_id not in video_frames:
+ video_frames[video_id] = VideoFrameInfo(
+ video_id=video_id,
+ location=video_path,
+ frame_file_stem=f"{stem}_",
+ frame_string_length=len(frame_name),
+ min_frame_number=int(path_frame_id),
+ max_frame_number=int(path_frame_id),
+ file_extension=file_extension,
+ )
+ else:
+ video_frame_info = video_frames[video_id]
+ # Check that this new frame is of the same format as other frames for this video
+ # and that it is the next frame in order, if so update the frame info for this
+ # video to reflect there is an additional frame.
+ # We don't need to check video_id or frame_file_stem as they are function of
+ # video_id which is aligned within the dictionary
+ assert video_frame_info.frame_string_length == len(frame_name)
+ assert video_frame_info.location == video_path, (
+ f"Frames for {video_id} found in two paths: "
+ f"{video_frame_info.location} and {video_path}"
+ )
+ assert video_frame_info.max_frame_number + 1 == int(path_frame_id)
+ assert (
+ video_frame_info.file_extension == file_extension
+ ), f"Frames with two different file extensions found for video {video_id}"
+ video_frames[video_id] = VideoFrameInfo(
+ video_id=video_frame_info.video_id,
+ location=video_frame_info.location,
+ frame_file_stem=video_frame_info.frame_file_stem,
+ frame_string_length=video_frame_info.frame_string_length,
+ min_frame_number=video_frame_info.min_frame_number,
+ max_frame_number=int(path_frame_id), # Update
+ file_extension=video_frame_info.file_extension,
+ )
+
+ video_paths = [
+ (video_id, f"{data_directory_path}/{video_id}") for video_id in video_ids
+ ]
+ # Kick off frame indexing for all participants
+ optional_threaded_foreach(add_video_frames, video_paths, multithreaded)
+
+ return video_frames
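+
+# Illustrative example (hypothetical directory; assumes 10-digit zero-padded frame
+# numbers): for files "root/P01_01/frame_0000000001.jpg" ... "frame_0000000090.jpg",
+# add_video_frames produces
+#   video_frames["P01_01"] == VideoFrameInfo(
+#       video_id="P01_01", location="root/P01_01", frame_file_stem="frame_",
+#       frame_string_length=16, min_frame_number=1, max_frame_number=90,
+#       file_extension="jpg",
+#   )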
+
+
+def build_frame_manifest_from_nested_directory(
+ data_directory_path: str, multithreaded: bool
+) -> Dict[str, VideoFrameInfo]:
+ """
+ Args:
+ data_directory_path (str): Path or URI to EpicKitchenDataset data.
+ If this dataset is to load from the frame-based dataset:
+ Data at this path must be a folder of structure:
+ {
+ "{participant_id}" : [
+ "{participant_id}_{participant_video_id}_{frame_number}.{file_extension}",
+
+ ...],
+ ...}
+
+ multithreaded (bool):
+ controls whether io operations are performed across multiple threads.
+
+ Returns:
+ Dictionary mapping video_id of available videos to the locations of their
+ underlying frame files.
+ """
+
+ participant_ids = g_pathmgr.ls(str(data_directory_path))
+ video_frames = {}
+
+ # Create function to execute in parallel that lists files available for each participant
+ def add_participant_video_frames(
+ participant_id: str, participant_path: str
+ ) -> None:
+ participant_frames = sorted(g_pathmgr.ls(str(participant_path)))
+ for frame_file_name in participant_frames:
+ file_extension = frame_file_name.split(".")[-1]
+ frame_name = frame_file_name[: -(len(file_extension) + 1)]
+ [path_participant_id, path_video_id, path_frame_id] = frame_name.split("_")
+ assert path_participant_id == participant_id
+ video_id = f"{path_participant_id}_{path_video_id}"
+ if (
+ video_id not in video_frames
+ ): # This is the first frame we have seen from video w/ video_id
+ video_frames[video_id] = VideoFrameInfo(
+ video_id=video_id,
+ location=participant_path,
+ frame_file_stem=f"{video_id}_",
+ frame_string_length=len(frame_name),
+ min_frame_number=int(path_frame_id),
+ max_frame_number=int(path_frame_id),
+ file_extension=file_extension,
+ )
+ else:
+ video_frame_info = video_frames[video_id]
+ # Check that this new frame is of the same format as other frames for this video
+ # and that it is the next frame in order, if so update the frame info for this
+ # video to reflect there is an additional frame.
+ # We don't need to check video_id or frame_file_stem as they are function of
+ # video_id which is aligned within the dictionary
+ assert video_frame_info.frame_string_length == len(frame_name)
+ assert video_frame_info.location == participant_path, (
+ f"Frames for {video_id} found in two paths: "
+ f"{video_frame_info.location} and {participant_path}"
+ )
+ assert video_frame_info.max_frame_number + 1 == int(path_frame_id)
+ assert (
+ video_frame_info.file_extension == file_extension
+ ), f"Frames with two different file extensions found for video {video_id}"
+ video_frames[video_id] = VideoFrameInfo(
+ video_id=video_frame_info.video_id,
+ location=video_frame_info.location,
+ frame_file_stem=video_frame_info.frame_file_stem,
+ frame_string_length=video_frame_info.frame_string_length,
+ min_frame_number=video_frame_info.min_frame_number,
+ max_frame_number=int(path_frame_id), # Update
+ file_extension=video_frame_info.file_extension,
+ )
+
+ participant_paths = [
+ (participant_id, f"{data_directory_path}/{participant_id}")
+ for participant_id in participant_ids
+ ]
+ # Kick off frame indexing for all participants
+ optional_threaded_foreach(
+ add_participant_video_frames, participant_paths, multithreaded
+ )
+
+ return video_frames
+
+
+def build_encoded_manifest_from_nested_directory(
+ data_directory_path: str,
+) -> Dict[str, EncodedVideoInfo]:
+ """
+ Creates a dictionary from video_id to EncodedVideoInfo for
+ encoded videos in the given directory.
+
+ Args:
+ data_directory_path (str): The folder to ls to find encoded
+ video files.
+
+ Returns:
+ Dict[str, EncodedVideoInfo] mapping video_id to EncodedVideoInfo
+ for each file in 'data_directory_path'
+ """
+ encoded_video_infos = {}
+ for participant_id in g_pathmgr.ls(data_directory_path):
+ participant_folder_path = f"{data_directory_path}/{participant_id}"
+ for video_file_name in g_pathmgr.ls(participant_folder_path):
+ video_id = video_file_name[:6]
+ video_full_path = f"{participant_folder_path}/{video_file_name}"
+ encoded_video_infos[video_id] = EncodedVideoInfo(video_id, video_full_path)
+ return encoded_video_infos
diff --git a/pytorchvideo/data/epic_kitchen_forecasting.py b/pytorchvideo/data/epic_kitchen_forecasting.py
new file mode 100644
index 00000000..8a6ad5e6
--- /dev/null
+++ b/pytorchvideo/data/epic_kitchen_forecasting.py
@@ -0,0 +1,295 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from dataclasses import fields as dataclass_fields
+from enum import Enum
+from typing import Any, Callable, Dict, List, Optional
+
+import torch
+from pytorchvideo.data.dataset_manifest_utils import (
+ EncodedVideoInfo,
+ VideoClipInfo,
+ VideoDatasetType,
+ VideoFrameInfo,
+ VideoInfo,
+)
+from pytorchvideo.data.epic_kitchen import ActionData, EpicKitchenDataset
+from pytorchvideo.data.video import Video
+
+
+class ClipSampling(Enum):
+ Random = 1
+
+
+class EpicKitchenForecasting(EpicKitchenDataset):
+ """
+ Action forecasting video data set for EpicKitchen-55 Dataset.
+
+
+ This dataset handles the loading, decoding, and clip sampling for the videos.
+ """
+
+ def __init__(
+ self,
+ video_info_file_path: str,
+ actions_file_path: str,
+ video_data_manifest_file_path: str,
+ clip_sampling: ClipSampling = ClipSampling.Random,
+ dataset_type: VideoDatasetType = VideoDatasetType.Frame,
+ seconds_per_clip: float = 2.0,
+ clip_time_stride: float = 10.0,
+ num_input_clips: int = 1,
+ frames_per_clip: Optional[int] = None,
+ num_forecast_actions: int = 1,
+ transform: Optional[Callable[[Dict[str, Any]], Any]] = None,
+ multithreaded_io: bool = True,
+ ):
+ f"""
+ Args:
+ video_info_file_path (str):
+ Path or URI to manifest with basic metadata of each video.
+ File must be a csv (w/header) with columns:
+ {[f.name for f in dataclass_fields(VideoInfo)]}
+
+ actions_file_path (str):
+ Path or URI to manifest with action annotations for each video.
+ File must be a csv (w/header) with columns:
+ {[f.name for f in dataclass_fields(ActionData)]}
+
+ video_data_manifest_file_path (str):
+ The path to a manifest file outlining the available video data for the
+ associated videos. File must be a csv (w/header) with columns either:
+
+ For Frame Videos:
+ {[f.name for f in dataclass_fields(VideoFrameInfo)]}
+
+ For Encoded Videos:
+ {[f.name for f in dataclass_fields(EncodedVideoInfo)]}
+
+ To generate this file from a directory of video frames, see helper
+ functions in Module: pytorchvideo.data.epic_kitchen.utils
+
+ clip_sampling (ClipSampling):
+ The type of sampling to perform on the videos of the dataset.
+
+ dataset_type (VideoDatasetType): The data format in which the dataset's
+ video data is stored (e.g. video frames, encoded video, etc.).
+
+ seconds_per_clip (float): The length of each sampled subclip in seconds.
+
+ clip_time_stride (float): The time difference in seconds between the start of
+ each input subclip.
+
+ num_input_clips (int): The number of subclips to be included in the input
+ video data.
+
+ frames_per_clip (Optional[int]): The number of frames per clip to sample.
+ If None, all frames in the clip will be included.
+
+ num_forecast_actions (int): The number of actions to be included in the
+ action vector.
+
+ transform (Callable[[Dict[str, Any]], Any]):
+ This callable is evaluated on the clip output before the clip is returned.
+ It can be used for user-defined preprocessing and augmentations to the clips.
+ The clip input is a dictionary with the following format:
+ {{
+ 'video_id': <str>,
+ 'video': <video_tensor>,
+ 'audio': <audio_tensor>,
+ 'label': <label>,
+ 'start_time': <float>,
+ 'stop_time': <float>
+ }}
+
+ If transform is None, the raw clip output in the above format is
+ returned unmodified.
+
+ multithreaded_io (bool):
+ Boolean to control whether parallelizable IO operations are performed across
+ multiple threads.
+ """
+ define_clip_structure_fn = (
+ EpicKitchenForecasting._define_clip_structure_generator(
+ clip_sampling,
+ seconds_per_clip,
+ clip_time_stride,
+ num_input_clips,
+ num_forecast_actions,
+ )
+ )
+ frame_filter = (
+ EpicKitchenForecasting._frame_filter_generator(
+ frames_per_clip, seconds_per_clip, clip_time_stride, num_input_clips
+ )
+ if frames_per_clip is not None
+ else None
+ )
+ transform = EpicKitchenForecasting._transform_generator(
+ transform, num_forecast_actions, frames_per_clip, num_input_clips
+ )
+
+ super().__init__(
+ video_info_file_path=video_info_file_path,
+ actions_file_path=actions_file_path,
+ video_data_manifest_file_path=video_data_manifest_file_path,
+ dataset_type=dataset_type,
+ transform=transform,
+ frame_filter=frame_filter,
+ clip_sampler=define_clip_structure_fn,
+ multithreaded_io=multithreaded_io,
+ )
+
+ @staticmethod
+ def _transform_generator(
+ transform: Callable[[Dict[str, Any]], Dict[str, Any]],
+ num_forecast_actions: int,
+ frames_per_clip: int,
+ num_input_clips: int,
+ ) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
+ """
+ Args:
+ transform (Callable[[Dict[str, Any]], Dict[str, Any]]): A function that performs
+ any operation on a clip before it is returned in the default transform function.
+ num_forecast_actions: (int) The number of actions to be included in the
+ action vector.
+ frames_per_clip (int): The number of frames per clip to sample.
+ num_input_clips (int): The number of subclips to be included in the video data.
+
+ Returns:
+ A function that performs any operation on a clip and returns the transformed clip.
+ """
+
+ def transform_clip(clip: Dict[str, Any]) -> Dict[str, Any]:
+ assert all(
+ clip["actions"][i].start_time <= clip["actions"][i + 1].start_time
+ for i in range(len(clip["actions"]) - 1)
+ ), "Actions must be sorted"
+ next_k_actions: List[ActionData] = [
+ a for a in clip["actions"] if (a.start_time > clip["stop_time"])
+ ][:num_forecast_actions]
+ clip["actions"] = next_k_actions
+
+ assert clip["video"].size()[1] == num_input_clips * frames_per_clip
+ clip_video_tensor = torch.stack(
+ [
+ clip["video"][
+ :, (i * frames_per_clip) : ((i + 1) * frames_per_clip), :, :
+ ]
+ for i in range(num_input_clips)
+ ]
+ )
+ clip["video"] = clip_video_tensor
+
+ for key in clip:
+ if clip[key] is None:
+ clip[key] = torch.tensor([])
+
+ if transform:
+ clip = transform(clip)
+
+ return clip
+
+ return transform_clip
+
+ @staticmethod
+ def _frame_filter_generator(
+ frames_per_clip: int,
+ seconds_per_clip: float,
+ clip_time_stride: float,
+ num_input_clips: int,
+ ) -> Callable[[List[int]], List[int]]:
+ """
+ Args:
+ frames_per_clip (int): The number of frames per clip to sample.
+ seconds_per_clip (float): The length of each sampled subclip in seconds.
+ clip_time_stride (float): The time difference in seconds between the start of
+ each input subclip.
+ num_input_clips (int): The number of subclips to be included in the video data.
+
+ Returns:
+ A function that takes in a list of frame indices and outputs a subsampled list.
+ """
+ time_window_length = seconds_per_clip + (num_input_clips - 1) * clip_time_stride
+ desired_frames_per_second = frames_per_clip / seconds_per_clip
+
+ def frame_filter(frame_indices: List[int]) -> List[int]:
+ num_available_frames_for_all_clips = len(frame_indices)
+ available_frames_per_second = (
+ num_available_frames_for_all_clips / time_window_length
+ )
+ intra_clip_sampling_stride = int(
+ available_frames_per_second // desired_frames_per_second
+ )
+ selected_frames = set()
+ for i in range(num_input_clips):
+ clip_start_index = int(
+ i * clip_time_stride * available_frames_per_second
+ )
+ for j in range(frames_per_clip):
+ selected_frames.add(
+ clip_start_index + j * intra_clip_sampling_stride
+ )
+ return [x for i, x in enumerate(frame_indices) if i in selected_frames]
+
+ return frame_filter
+
+ @staticmethod
+ def _define_clip_structure_generator(
+ clip_sampling: ClipSampling,
+ seconds_per_clip: float,
+ clip_time_stride: float,
+ num_input_clips: int,
+ num_forecast_actions: int,
+ ) -> Callable[[Dict[str, Video], Dict[str, List[ActionData]]], List[VideoClipInfo]]:
+ """
+ Args:
+ clip_sampling (ClipSampling):
+ The type of sampling to perform on the videos of the dataset.
+ seconds_per_clip (float): The length of each sampled clip in seconds.
+ clip_time_stride: The time difference in seconds between the start of
+ each input subclip.
+ num_input_clips (int): The number of subclips to be included in the video data.
+ num_forecast_actions (int): The number of actions to be included in the
+ action vector.
+
+ Returns:
+ A function that takes a dictionary of videos and outputs a list of sampled
+ clips.
+ """
+ # TODO(T77683480)
+ if clip_sampling != ClipSampling.Random:
+ raise NotImplementedError(
+ f"Only {ClipSampling.Random} is implemented. "
+ f"{clip_sampling} not implemented."
+ )
+
+ time_window_length = seconds_per_clip + (num_input_clips - 1) * clip_time_stride
+
+ def define_clip_structure(
+ videos: Dict[str, Video], video_actions: Dict[str, List[ActionData]]
+ ) -> List[VideoClipInfo]:
+ candidate_sample_clips = []
+ for video_id, actions in video_actions.items():
+ for i, action in enumerate(actions[: (-1 * num_forecast_actions)]):
+ # Only keep actions that have at least num_forecast_actions actions after
+ # them to predict. Confirm there are >= num_forecast_actions available
+ # (it is possible for actions to overlap).
+ number_valid_actions = 0
+ for j in range(i + 1, len(actions)):
+ if actions[j].start_time > action.stop_time:
+ number_valid_actions += 1
+ if number_valid_actions == num_forecast_actions:
+ if (
+ action.start_time - time_window_length >= 0
+ ): # Only add clips that have the full input video available
+ candidate_sample_clips.append(
+ VideoClipInfo(
+ video_id,
+ action.stop_time - time_window_length,
+ action.stop_time,
+ )
+ )
+ break
+ return candidate_sample_clips
+
+ return define_clip_structure
diff --git a/pytorchvideo/data/epic_kitchen_recognition.py b/pytorchvideo/data/epic_kitchen_recognition.py
new file mode 100644
index 00000000..8a6f688e
--- /dev/null
+++ b/pytorchvideo/data/epic_kitchen_recognition.py
@@ -0,0 +1,212 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import random
+from dataclasses import fields as dataclass_fields
+from enum import Enum
+from typing import Any, Callable, Dict, List, Optional
+
+import torch
+from pytorchvideo.data.dataset_manifest_utils import (
+ EncodedVideoInfo,
+ VideoClipInfo,
+ VideoDatasetType,
+ VideoFrameInfo,
+ VideoInfo,
+)
+from pytorchvideo.data.epic_kitchen import ActionData, EpicKitchenDataset
+from pytorchvideo.data.video import Video
+
+
+class ClipSampling(Enum):
+ RandomOffsetUniform = 1
+
+
+class EpicKitchenRecognition(EpicKitchenDataset):
+ """
+ Action recognition video dataset for the EpicKitchen-55 Dataset.
+
+ This dataset handles the loading, decoding, and clip sampling for the videos.
+ """
+
+ def __init__(
+ self,
+ video_info_file_path: str,
+ actions_file_path: str,
+ video_data_manifest_file_path: str,
+ clip_sampling: ClipSampling = ClipSampling.RandomOffsetUniform,
+ dataset_type: VideoDatasetType = VideoDatasetType.Frame,
+ seconds_per_clip: float = 2.0,
+ frames_per_clip: Optional[int] = None,
+ transform: Callable[[Dict[str, Any]], Any] = None,
+ multithreaded_io: bool = True,
+ ):
+ f"""
+ Args:
+ video_info_file_path (str):
+ Path or URI to manifest with basic metadata of each video.
+ File must be a csv (w/header) with columns:
+ {[f.name for f in dataclass_fields(VideoInfo)]}
+
+ actions_file_path (str):
+ Path or URI to manifest with action annotations for each video.
+ File must be a csv (w/header) with columns:
+ {[f.name for f in dataclass_fields(ActionData)]}
+
+ video_data_manifest_file_path (str):
+ The path to a json file outlining the available video data for the
+ associated videos. File must be a csv (w/header) with columns either:
+
+ For Frame Videos:
+ {[f.name for f in dataclass_fields(VideoFrameInfo)]}
+
+ For Encoded Videos:
+ {[f.name for f in dataclass_fields(EncodedVideoInfo)]}
+
+ To generate this file from a directory of video frames, see helper
+ functions in Module: pytorchvideo.data.epic_kitchen.utils
+
+ clip_sampling (ClipSampling):
+ The type of sampling to perform on the videos of the dataset.
+
+ dataset_type (VideoDatasetType): The data format in which the dataset's
+ video data is stored (e.g. video frames, encoded video, etc).
+
+ seconds_per_clip (float): The length of each sampled clip in seconds.
+
+ frames_per_clip (Optional[int]): The number of frames per clip to sample.
+
+ transform (Callable[[Dict[str, Any]], Any]):
+ This callable is evaluated on the clip output before the clip is returned.
+ It can be used for user-defined preprocessing and augmentations to the clips.
+ The clip input is a dictionary with the following format:
+ {{
+ 'video_id': <str>,
+ 'video': <video_tensor>,
+ 'audio': <audio_tensor>,
+ 'label': <label>,
+ 'start_time': <float>,
+ 'stop_time': <float>
+ }}
+
+ If transform is None, the raw clip output in the above format is
+ returned unmodified.
+
+ multithreaded_io (bool):
+ Boolean to control whether parallelizable io operations are performed across
+ multiple threads.
+ """
+ define_clip_structure_fn = (
+ EpicKitchenRecognition._define_clip_structure_generator(
+ seconds_per_clip, clip_sampling
+ )
+ )
+ transform = EpicKitchenRecognition._transform_generator(transform)
+ frame_filter = (
+ EpicKitchenRecognition._frame_filter_generator(frames_per_clip)
+ if frames_per_clip is not None
+ else None
+ )
+
+ super().__init__(
+ video_info_file_path=video_info_file_path,
+ actions_file_path=actions_file_path,
+ dataset_type=dataset_type,
+ video_data_manifest_file_path=video_data_manifest_file_path,
+ transform=transform,
+ frame_filter=frame_filter,
+ clip_sampler=define_clip_structure_fn,
+ multithreaded_io=multithreaded_io,
+ )
+
+ @staticmethod
+ def _transform_generator(
+ transform: Callable[[Dict[str, Any]], Dict[str, Any]]
+ ) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
+ """
+ Args:
+ transform (Callable[[Dict[str, Any]], Dict[str, Any]]): A function that performs
+ any operation on a clip before it is returned in the default transform function.
+
+ Returns:
+ A function that performs any operation on a clip and returns the transformed clip.
+ """
+
+ def transform_clip(clip: Dict[str, Any]) -> Dict[str, Any]:
+ actions_in_clip: List[ActionData] = [
+ a
+ for a in clip["actions"]
+ if (
+ a.start_time <= clip["stop_time"]
+ and a.stop_time >= clip["start_time"]
+ )
+ ]
+ clip["actions"] = actions_in_clip
+
+ for key in clip:
+ if clip[key] is None:
+ clip[key] = torch.tensor([])
+
+ if transform:
+ clip = transform(clip)
+
+ return clip
+
+ return transform_clip
+
+ @staticmethod
+ def _frame_filter_generator(
+ frames_per_clip: int,
+ ) -> Callable[[List[int]], List[int]]:
+ """
+ Args:
+ frames_per_clip (int): The number of frames per clip to sample.
+
+ Returns:
+ A function that takes in a list of frame indices and outputs a subsampled list.
+ """
+
+ def frame_filter(frame_indices: List[int]) -> List[int]:
+ num_frames = len(frame_indices)
+ frame_step = int(num_frames // frames_per_clip)
+ selected_frames = set(range(0, num_frames, frame_step))
+ return [x for i, x in enumerate(frame_indices) if i in selected_frames]
+
+ return frame_filter
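+ # Illustrative example: with 60 available frame indices and frames_per_clip=8,
+ # frame_step = int(60 // 8) = 7, so indices {0, 7, 14, ..., 56} are kept (9 frames;
+ # this simple stride rule may return slightly more than frames_per_clip frames).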
+
+ @staticmethod
+ def _define_clip_structure_generator(
+ seconds_per_clip: float, clip_sampling: ClipSampling
+ ) -> Callable[[Dict[str, Video], Dict[str, List[ActionData]]], List[VideoClipInfo]]:
+ """
+ Args:
+ seconds_per_clip (float): The length of each sampled clip in seconds.
+ clip_sampling (ClipSampling):
+ The type of sampling to perform on the videos of the dataset.
+
+ Returns:
+ A function that takes a dictionary of videos and a dictionary of the actions
+ for each video and outputs a list of sampled clips.
+ """
+ if not clip_sampling == ClipSampling.RandomOffsetUniform:
+ raise NotImplementedError(
+ f"Only {ClipSampling.RandomOffsetUniform} is implemented. "
+ f"{clip_sampling} not implemented."
+ )
+
+ def define_clip_structure(
+ videos: Dict[str, Video], actions: Dict[str, List[ActionData]]
+ ) -> List[VideoClipInfo]:
+ clips = []
+ for video_id, video in videos.items():
+ offset = random.random() * seconds_per_clip
+ num_clips = int((video.duration - offset) // seconds_per_clip)
+
+ for i in range(num_clips):
+ start_time = i * seconds_per_clip + offset
+ stop_time = start_time + seconds_per_clip
+ clip = VideoClipInfo(video_id, start_time, stop_time)
+ clips.append(clip)
+ return clips
+
+ return define_clip_structure
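+ # Usage sketch (illustrative only; the manifest paths below are hypothetical):
+ #
+ # dataset = EpicKitchenRecognition(
+ #     video_info_file_path="manifests/video_info.csv",
+ #     actions_file_path="manifests/actions.csv",
+ #     video_data_manifest_file_path="manifests/frame_manifest.csv",
+ #     seconds_per_clip=2.0,
+ #     frames_per_clip=8,
+ # )
+ # Each sampled clip is returned as a dictionary in the format documented above.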
diff --git a/pytorchvideo/data/frame_video.py b/pytorchvideo/data/frame_video.py
new file mode 100644
index 00000000..3b354f8d
--- /dev/null
+++ b/pytorchvideo/data/frame_video.py
@@ -0,0 +1,210 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from __future__ import annotations
+
+import logging
+import time
+from typing import Callable, Dict, List, Optional
+
+import numpy as np
+import torch
+import torch.utils.data
+from iopath.common.file_io import g_pathmgr
+from pytorchvideo.data.utils import optional_threaded_foreach
+
+from .utils import thwc_to_cthw
+from .video import Video
+
+
+try:
+ import cv2
+except ImportError:
+ _HAS_CV2 = False
+else:
+ _HAS_CV2 = True
+
+
+logger = logging.getLogger(__name__)
+
+
+class FrameVideo(Video):
+ """
+ FrameVideo is an abstraction for accessing clips, based on their start and end
+ times, from a video whose frames are stored as individual images. PathManager is
+ used for frame image reading, allowing non-local URIs to be used.
+ """
+
+ def __init__(
+ self,
+ duration: float,
+ fps: float,
+ video_frame_to_path_fn: Callable[[int], str] = None,
+ video_frame_paths: List[str] = None,
+ multithreaded_io: bool = False,
+ ) -> None:
+ """
+ Args:
+ duration (float): the duration of the video in seconds.
+ fps (float): the target fps for the video. This is needed to link the frames
+ to a second timestamp in the video.
+ video_frame_to_path_fn (Callable[[int], str]): a function that maps from a frame
+ index integer to the file path where the frame is located.
+ video_frame_paths (List[str]): List of frame paths, indexed by frame number, for the video.
+ multithreaded_io (bool): controls whether parallelizable io operations are
+ performed across multiple threads.
+ """
+ if not _HAS_CV2:
+ raise ImportError(
+ "opencv2 is required to use FrameVideo. Please "
+ "install with 'pip install opencv-python'"
+ )
+
+ self._duration = duration
+ self._fps = fps
+
+ assert (video_frame_to_path_fn is None) != (
+ video_frame_paths is None
+ ), "Only one of video_frame_to_path_fn or video_frame_paths can be provided"
+ self._video_frame_to_path_fn = video_frame_to_path_fn
+ self._video_frame_paths = video_frame_paths
+
+ self._multithreaded_io = multithreaded_io
+
+ @classmethod
+ def from_frame_paths(
+ cls,
+ video_frame_paths: List[str],
+ fps: float = 30.0,
+ multithreaded_io: bool = False,
+ ):
+ """
+ Args:
+ video_frame_paths (List[str]): a list of paths to each frame in the video.
+ fps (float): the target fps for the video. This is needed to link the frames
+ to a second timestamp in the video.
+ multithreaded_io (bool): controls whether parallelizable io operations are
+ performed across multiple threads.
+ """
+ assert len(video_frame_paths) != 0, "video_frame_paths is empty"
+
+ return cls(
+ len(video_frame_paths) / fps,
+ fps,
+ video_frame_paths=video_frame_paths,
+ multithreaded_io=multithreaded_io,
+ )
+
+ @property
+ def duration(self) -> float:
+ """
+ Returns:
+ duration: the video's duration/end-time in seconds.
+ """
+ return self._duration
+
+ def _get_frame_index_for_time(self, time_sec: float) -> int:
+ return int(np.round(self._fps * time_sec))
+
+ def get_clip(
+ self,
+ start_sec: float,
+ end_sec: float,
+ frame_filter: Optional[Callable[[List[int]], List[int]]] = None,
+ ) -> Dict[str, Optional[torch.Tensor]]:
+ """
+ Retrieves frames from the stored video at the specified start and end times
+ in seconds (the video always starts at 0 seconds). Given that PathManager may
+ be fetching the frames from network storage, to handle transient errors, frame
+ reading is retried N times.
+
+ Args:
+ start_sec (float): the clip start time in seconds
+ end_sec (float): the clip end time in seconds
+ frame_filter (Optional[Callable[[List[int]], List[int]]]):
+ function to subsample frames in a clip before loading.
+ If None, no subsampling is performed.
+ Returns:
+ clip_data: A dictionary with the following entries:
+
+ "video": A tensor of the clip's RGB frames with shape:
+ (channel, time, height, width). The frames are of type torch.float32 and
+ in the range [0 - 255]. Raises an exception if unable to load images.
+
+ "frame_indices": A list of indices for each frame relative to all frames in the
+ video.
+
+ Returns None if no frames are found.
+ """
+ if start_sec < 0 or start_sec > self._duration:
+ logger.warning(
+ f"No frames found within {start_sec} and {end_sec} seconds. Video starts"
+ f"at time 0 and ends at {self._duration}."
+ )
+ return None
+
+ end_sec = min(end_sec, self._duration)
+
+ start_frame_index = self._get_frame_index_for_time(start_sec)
+ end_frame_index = self._get_frame_index_for_time(end_sec)
+ frame_indices = list(range(start_frame_index, end_frame_index))
+ # Frame filter function to allow for subsampling before loading
+ if frame_filter:
+ frame_indices = frame_filter(frame_indices)
+
+ clip_paths = [self._video_frame_to_path(i) for i in frame_indices]
+ clip_frames = _load_images_with_retries(
+ clip_paths, multithreaded=self._multithreaded_io
+ )
+ clip_frames = thwc_to_cthw(clip_frames).to(torch.float32)
+ return {"video": clip_frames, "frame_indices": frame_indices}
+
+ def _video_frame_to_path(self, frame_index: int) -> str:
+ if self._video_frame_to_path_fn:
+ return self._video_frame_to_path_fn(frame_index)
+ elif self._video_frame_paths:
+ return self._video_frame_paths[frame_index]
+ else:
+ raise Exception(
+ "One of _video_frame_to_path_fn or _video_frame_paths must be set"
+ )
+
+
+def _load_images_with_retries(
+ image_paths: List[str], num_retries: int = 10, multithreaded: bool = True
+) -> torch.Tensor:
+ """
+ Loads the given image paths using PathManager, decodes them as RGB images and
+ returns them as a stacked tensor.
+ Args:
+ image_paths (List[str]): a list of paths to images.
+ num_retries (int): number of times to retry image reading to handle transient errors.
+ multithreaded (bool): if images are fetched via multiple threads in parallel.
+ Returns:
+ A tensor of the clip's RGB frames with shape:
+ (time, height, width, channel). The frames are of type torch.uint8 and
+ in the range [0 - 255]. Raises an exception if unable to load images.
+ """
+ imgs = [None for i in image_paths]
+
+ def fetch_image(image_index: int, image_path: str) -> None:
+ for i in range(num_retries):
+ with g_pathmgr.open(image_path, "rb") as f:
+ img_str = np.frombuffer(f.read(), np.uint8)
+ img_bgr = cv2.imdecode(img_str, flags=cv2.IMREAD_COLOR)
+ img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
+ if img_rgb is not None:
+ imgs[image_index] = img_rgb
+ return
+ else:
+ logging.warning(f"Reading attempt {i}/{num_retries} failed.")
+ time.sleep(1e-6)
+
+ optional_threaded_foreach(fetch_image, enumerate(image_paths), multithreaded)
+
+ if any((img is None for img in imgs)):
+ raise Exception("Failed to load images from {}".format(image_paths))
+
+ return torch.as_tensor(np.stack(imgs))
diff --git a/pytorchvideo/data/hmdb51.py b/pytorchvideo/data/hmdb51.py
new file mode 100644
index 00000000..c04ed917
--- /dev/null
+++ b/pytorchvideo/data/hmdb51.py
@@ -0,0 +1,230 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from __future__ import annotations
+
+import logging
+import os
+import pathlib
+from typing import Any, Callable, List, Optional, Tuple, Type, Union
+
+import torch.utils.data
+from iopath.common.file_io import g_pathmgr
+
+from .clip_sampling import ClipSampler
+from .encoded_video_dataset import EncodedVideoDataset
+
+
+logger = logging.getLogger(__name__)
+
+
+class Hmdb51LabeledVideoPaths:
+ """
+ Pre-processor for the HMDB51 dataset described here -
+ https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/
+
+ This dataset consists of class-wise splits, with each class having 3
+ folds (splits).
+
+ The videos directory is of the format,
+ video_dir_path/class_x/<some_video_name>.avi
+ ...
+ video_dir_path/class_y/<some_video_name>.avi
+
+ The splits/fold directory is of the format,
+ folds_dir_path/class_x_test_split_1.txt
+ folds_dir_path/class_x_test_split_2.txt
+ folds_dir_path/class_x_test_split_3.txt
+ ...
+ folds_dir_path/class_y_test_split_1.txt
+ folds_dir_path/class_y_test_split_2.txt
+ folds_dir_path/class_y_test_split_3.txt
+
+ And each text file in the splits directory class_x_test_split_<1 or 2 or 3>.txt
+ <video_name>.avi <0 or 1 or 2>
+ where 0, 1, 2 correspond to the unused, train, and test splits respectively.
+
+ Each video has a name of the format
+ <some_name>_<tag1>_<tag2>_..._<tag_n>.avi
+ For more details on tags -
+ https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/
+ """
+
+ _allowed_splits = [1, 2, 3]
+ _split_type_dict = {"train": 1, "test": 2, "unused": 0}
+
+ @classmethod
+ def from_dir(
+ cls, data_path: str, split_id: int = 1, split_type: str = "train"
+ ) -> Hmdb51LabeledVideoPaths:
+ """
+ Factory function that creates Hmdb51LabeledVideoPaths object from a splits/folds
+ directory.
+
+ Args:
+ data_path (str): The path to the splits/folds directory of HMDB51.
+ split_id (int): Fold id to be loaded. Belongs to [1,2,3]
+ split_type (str): Split/Fold type to be loaded. It belongs to one of the
+ following,
+ - "train"
+ - "test"
+ - "unused" (This is a small set of videos that are neither
+ of part of test or train fold.)
+ """
+ data_path = pathlib.Path(data_path)
+ if not data_path.is_dir():
+ return RuntimeError(f"{data_path} not found or is not a directory.")
+ if not int(split_id) in cls._allowed_splits:
+ raise RuntimeError(
+ f"{split_id} not found in allowed split id's {cls._allowed_splits}."
+ )
+ file_name_format = "_test_split" + str(int(split_id))
+ file_paths = sorted(
+ (
+ f
+ for f in data_path.iterdir()
+ if f.is_file() and f.suffix == ".txt" and file_name_format in f.stem
+ )
+ )
+ return cls.from_csvs(file_paths, split_type)
+
+ @classmethod
+ def from_csvs(
+ cls, file_paths: List[Union[pathlib.Path, str]], split_type: str = "train"
+ ) -> Hmdb51LabeledVideoPaths:
+ """
+ Factory function that creates Hmdb51LabeledVideoPaths object from a list of
+ split files of .txt type
+
+ Args:
+ file_paths (List[Union[pathlib.Path, str]]): The paths to the .txt split
+ files of HMDB51 to be loaded.
+ split_type (str): Split/Fold type to be loaded.
+ - "train"
+ - "test"
+ - "unused"
+ """
+ video_paths_and_label = []
+ for file_path in file_paths:
+ file_path = pathlib.Path(file_path)
+ assert g_pathmgr.exists(file_path), f"{file_path} not found."
+ if not (file_path.suffix == ".txt" and "_test_split" in file_path.stem):
+ return RuntimeError(f"Ivalid file: {file_path}")
+
+ action_name = "_"
+ action_name = action_name.join((file_path.stem).split("_")[:-2])
+ with g_pathmgr.open(file_path, "r") as f:
+ for path_label in f.read().splitlines():
+ line_split = path_label.rsplit(None, 1)
+
+ if not int(line_split[1]) == cls._split_type_dict[split_type]:
+ continue
+
+ file_path = os.path.join(action_name, line_split[0])
+ meta_tags = line_split[0].split("_")[-6:-1]
+ video_paths_and_label.append(
+ (file_path, {"label": action_name, "meta_tags": meta_tags})
+ )
+
+ assert (
+ len(video_paths_and_label) > 0
+ ), f"Failed to load dataset from {file_path}."
+ return cls(video_paths_and_label)
+
+ def __init__(
+ self, paths_and_labels: List[Tuple[str, Optional[dict]]], path_prefix=""
+ ) -> None:
+ """
+ Args:
+ paths_and_labels [(str, Optional[dict])]: a list of tuples containing the video
+ path and a dictionary with the label and meta tags.
+ """
+ self._paths_and_labels = paths_and_labels
+ self._path_prefix = path_prefix
+
+ def path_prefix(self, prefix):
+ self._path_prefix = prefix
+
+ path_prefix = property(None, path_prefix)
+
+ def __getitem__(self, index: int) -> Tuple[str, dict]:
+ """
+ Args:
+ index (int): the path and label index.
+
+ Returns:
+ The path and label tuple for the given index.
+ """
+ path, label = self._paths_and_labels[index]
+ return (os.path.join(self._path_prefix, path), label)
+
+ def __len__(self) -> int:
+ """
+ Returns:
+ The number of video paths and label pairs.
+ """
+ return len(self._paths_and_labels)
+
+
+def Hmdb51(
+ data_path: pathlib.Path,
+ clip_sampler: ClipSampler,
+ video_sampler: Type[torch.utils.data.Sampler] = torch.utils.data.RandomSampler,
+ transform: Optional[Callable[[dict], Any]] = None,
+ video_path_prefix: str = "",
+ split_id: int = 1,
+ split_type: str = "train",
+ decode_audio=True,
+ decoder: str = "pyav",
+) -> EncodedVideoDataset:
+ """
+ A helper function to create an EncodedVideoDataset object for the HMDB51 dataset.
+
+ Args:
+ data_path (pathlib.Path): Path to the splits/folds directory of HMDB51.
+ See the Hmdb51LabeledVideoPaths class documentation for specific
+ formatting details and examples.
+
+ clip_sampler (ClipSampler): Defines how clips should be sampled from each
+ video. See the clip sampling documentation for more information.
+
+ video_sampler (Type[torch.utils.data.Sampler]): Sampler for the internal
+ video container. This defines the order videos are decoded and,
+ if necessary, the distributed split.
+
+ transform (Callable): This callable is evaluated on the clip output before
+ the clip is returned. It can be used for user defined preprocessing and
+ augmentations to the clips. The clip output is a dictionary with the
+ following format:
+ {
+ 'video': <video_tensor>,
+ 'label': <action_label>,
+ 'index': <clip_index>
+ }
+ If transform is None, the raw clip output in the above format is
+ returned unmodified.
+
+ video_path_prefix (str): Path to root directory with the videos that are
+ loaded in EncodedVideoDataset. All the video paths before loading
+ are prefixed with this path.
+
+ decoder (str): Defines which backend should be used to decode videos.
+ """
+ labeled_video_paths = Hmdb51LabeledVideoPaths.from_dir(
+ data_path, split_id=split_id, split_type=split_type
+ )
+ labeled_video_paths.path_prefix = video_path_prefix
+ dataset = EncodedVideoDataset(
+ labeled_video_paths,
+ clip_sampler,
+ video_sampler,
+ transform,
+ decode_audio=decode_audio,
+ decoder=decoder,
+ )
+
+ return dataset
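+ # Usage sketch (illustrative only; paths are hypothetical and my_clip_sampler is
+ # assumed to be a ClipSampler instance from pytorchvideo.data.clip_sampling):
+ #
+ # dataset = Hmdb51(
+ #     data_path=pathlib.Path("hmdb51/folds"),  # directory of *_test_split_*.txt files
+ #     clip_sampler=my_clip_sampler,
+ #     video_path_prefix="hmdb51/videos",
+ #     split_id=1,
+ #     split_type="train",
+ # )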
diff --git a/pytorchvideo/data/kinetics.py b/pytorchvideo/data/kinetics.py
new file mode 100644
index 00000000..0aec18d2
--- /dev/null
+++ b/pytorchvideo/data/kinetics.py
@@ -0,0 +1,10 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from .encoded_video_dataset import labeled_encoded_video_dataset
+
+
+"""
+ Action recognition video dataset for Kinetics-{400,600,700} stored as encoded videos.
+
+"""
+Kinetics = labeled_encoded_video_dataset
diff --git a/pytorchvideo/data/labeled_video_paths.py b/pytorchvideo/data/labeled_video_paths.py
new file mode 100644
index 00000000..68eda354
--- /dev/null
+++ b/pytorchvideo/data/labeled_video_paths.py
@@ -0,0 +1,139 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from __future__ import annotations
+
+import os
+import pathlib
+from typing import List, Optional, Tuple
+
+from iopath.common.file_io import g_pathmgr
+from torchvision.datasets.folder import make_dataset
+
+
+class LabeledVideoPaths:
+ """
+ LabeledVideoPaths contains pairs of video path and integer index label.
+ """
+
+ @classmethod
+ def from_path(cls, data_path: str) -> LabeledVideoPaths:
+ """
+ Factory function that creates a LabeledVideoPaths object depending on the path
+ type.
+ - If it is a directory path it uses the LabeledVideoPaths.from_directory function.
+ - If it is a file path it uses the LabeledVideoPaths.from_csv function.
+ Args:
+ data_path (str): The path to the file or directory to be read.
+ """
+
+ if g_pathmgr.isfile(data_path):
+ return LabeledVideoPaths.from_csv(data_path)
+ elif g_pathmgr.isdir(data_path):
+ return LabeledVideoPaths.from_directory(data_path)
+ else:
+ raise FileNotFoundError(f"{data_path} not found.")
+
+ @classmethod
+ def from_csv(cls, file_path: str) -> LabeledVideoPaths:
+ """
+ Factory function that creates a LabeledVideoPaths object by reading a file with the
+ following format:
+
+ <video_path> <integer_label>
+ ...
+ <video_path> <integer_label>
+ Args:
+ file_path (str): The path to the file to be read.
+ """
+ assert g_pathmgr.exists(file_path), f"{file_path} not found."
+ video_paths_and_label = []
+ with g_pathmgr.open(file_path, "r") as f:
+ for path_label in f.read().splitlines():
+ line_split = path_label.rsplit(None, 1)
+
+ # The video path file may not contain labels (e.g. for a test split). We
+ # assume this is the case if only 1 path is found and set the label to
+ # -1 if so.
+ if len(line_split) == 1:
+ file_path = line_split[0]
+ label = -1
+ else:
+ file_path, label = line_split
+
+ video_paths_and_label.append((file_path, int(label)))
+
+ assert (
+ len(video_paths_and_label) > 0
+ ), f"Failed to load dataset from {file_path}."
+ return cls(video_paths_and_label)
+
+ @classmethod
+ def from_directory(cls, dir_path: str) -> LabeledVideoPaths:
+ """
+ Factory function that creates a LabeledVideoPaths object by parsing the structure
+ of the given directory's subdirectories into the classification labels. It
+ expects the directory format to be the following:
+ dir_path/<class_name>/<video_name>.mp4
+
+ Classes are indexed from 0 to the number of classes, alphabetically.
+
+ E.g.
+ dir_path/class_x/xxx.ext
+ dir_path/class_x/xxy.ext
+ dir_path/class_x/xxz.ext
+ dir_path/class_y/123.ext
+ dir_path/class_y/nsdf3.ext
+ dir_path/class_y/asd932_.ext
+
+ Would produce two classes labeled 0 and 1 with 3 video paths associated with each.
+
+ Args:
+ dir_path (str): Root directory containing the video class directories.
+ """
+ assert g_pathmgr.exists(dir_path), f"{dir_path} not found."
+
+ # Find all classes based on directory names. These classes are then sorted and indexed
+ # from 0 to the number of classes.
+ classes = sorted(f.name for f in pathlib.Path(dir_path).iterdir() if f.is_dir())
+ class_to_idx = {classes[i]: i for i in range(len(classes))}
+ video_paths_and_label = make_dataset(
+ dir_path, class_to_idx, extensions=("mp4", "avi")
+ )
+ assert (
+ len(video_paths_and_label) > 0
+ ), f"Failed to load dataset from {dir_path}."
+ return cls(video_paths_and_label)
+
+ def __init__(
+ self, paths_and_labels: List[Tuple[str, Optional[int]]], path_prefix=""
+ ) -> None:
+ """
+ Args:
+ paths_and_labels [(str, int)]: a list of tuples containing the video
+ path and integer label.
+ """
+ self._paths_and_labels = paths_and_labels
+ self._path_prefix = path_prefix
+
+ def path_prefix(self, prefix):
+ self._path_prefix = prefix
+
+ path_prefix = property(None, path_prefix)
+
+ def __getitem__(self, index: int) -> Tuple[str, int]:
+ """
+ Args:
+ index (int): the path and label index.
+
+ Returns:
+ The path and label tuple for the given index.
+ """
+ path, label = self._paths_and_labels[index]
+ return (os.path.join(self._path_prefix, path), {"label": label})
+
+ def __len__(self) -> int:
+ """
+ Returns:
+ The number of video paths and label pairs.
+ """
+ return len(self._paths_and_labels)
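+ # Usage sketch (illustrative only; paths are hypothetical):
+ #
+ # paths = LabeledVideoPaths.from_path("kinetics/train")  # one subdirectory per class
+ # paths.path_prefix = "/mnt/videos"                       # optional root prefix
+ # video_path, info = paths[0]                             # (prefixed path, {"label": 0})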
diff --git a/pytorchvideo/data/ssv2.py b/pytorchvideo/data/ssv2.py
new file mode 100644
index 00000000..1d6bcd0d
--- /dev/null
+++ b/pytorchvideo/data/ssv2.py
@@ -0,0 +1,254 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import csv
+import functools
+import json
+import os
+import random
+from collections import defaultdict
+from typing import Any, Callable, List, Optional, Tuple, Type
+
+import numpy as np
+import torch
+import torch.utils.data
+from iopath.common.file_io import g_pathmgr
+from pytorchvideo.data.clip_sampling import ClipSampler
+from pytorchvideo.data.frame_video import FrameVideo
+
+from .utils import MultiProcessSampler
+
+
+class SSv2(torch.utils.data.IterableDataset):
+ """
+ Action recognition video dataset for Something-something v2 (SSv2) stored
+ as image frames.
+
+ This dataset handles the parsing of frames, loading and clip sampling for the
+ videos. All io reading is done with PathManager, enabling non-local storage
+ URIs to be used.
+ """
+
+ def __init__(
+ self,
+ label_name_file: str,
+ video_label_file: str,
+ video_path_label_file: str,
+ clip_sampler: ClipSampler,
+ video_sampler: Type[torch.utils.data.Sampler] = torch.utils.data.RandomSampler,
+ transform: Optional[Callable[[dict], Any]] = None,
+ video_path_prefix: str = "",
+ frames_per_clip: Optional[int] = None,
+ rand_sample_frames: bool = False,
+ ) -> None:
+ """
+ Args:
+ label_name_file (str): ssv2 label file that contains the label names and
+ indexes.
+
+ video_label_file (str): a file that contains video ids and the corresponding
+ video label.
+
+ video_path_label_file (str): a file that contains frame paths for each
+ video and the corresponding frame label. The file must be a space separated
+ csv of the format:
+ `original_vido_id video_id frame_id path labels`
+
+ clip_sampler (ClipSampler): Defines how clips should be sampled from each
+ video. See the clip sampling documentation for more information.
+
+ video_sampler (Type[torch.utils.data.Sampler]): Sampler for the internal
+ video container. This defines the order videos are decoded and,
+ if necessary, the distributed split.
+
+ transform (Optional[Callable]): This callable is evaluated on the clip output before
+ the clip is returned. It can be used for user defined preprocessing and
+ augmentations to the clips. The clip output is a dictionary with the
+ following format:
+ {
+ 'video': <video_tensor>,
+ 'label': <index_label>,
+ 'video_index': <video_index>,
+ 'clip_index': <clip_index>,
+ 'aug_index': <aug_index>, the augmentation index, as augmentations
+ might generate multiple views for one clip.
+ }
+ If transform is None, the raw clip output in the above format is
+ returned unmodified.
+
+ video_path_prefix (str): prefix path to add to all frame paths read from video_path_label_file.
+
+ frames_per_clip (Optional[int]): The number of frames per clip to sample.
+
+ rand_sample_frames (bool): If True, randomly sample frames for each clip.
+ """
+ self._transform = transform
+ self._clip_sampler = clip_sampler
+ self._path_to_videos, self._labels = _read_video_paths_and_labels(
+ label_name_file,
+ video_label_file,
+ video_path_label_file,
+ prefix=video_path_prefix,
+ )
+ self._video_sampler = video_sampler(self._path_to_videos)
+ self._video_sampler_iter = None # Initialized on first call to self.__next__()
+ self._frame_filter = (
+ functools.partial(
+ SSv2._sample_clip_frames,
+ frames_per_clip=frames_per_clip,
+ rand_sample=rand_sample_frames,
+ )
+ if frames_per_clip is not None
+ else None
+ )
+
+ # Depending on the clip sampler type, we may want to sample multiple clips
+ # from one video. In that case, we store the video, label and previously
+ # sampled clip time in these variables.
+ self._loaded_video = None
+ self._next_clip_start_time = 0.0
+
+ @staticmethod
+ def _sample_clip_frames(
+ frame_indices: List[int], frames_per_clip: int, rand_sample: bool
+ ) -> List[int]:
+ """
+ Use segment-based input frame sampling that splits each video into segments;
+ from each segment, one frame is sampled to form a clip.
+
+ Args:
+ frame_indices (list): list of frame indices.
+ frames_per_clip (int): The number of frames per clip to sample.
+ rand_sample (bool): if True, randomly sample a frame within each segment.
+
+ Returns:
+ (list): A subsampled list of frames_per_clip frame indices.
+ """
+ num_frames = len(frame_indices)
+
+ seg_size = float(num_frames - 1) / frames_per_clip
+ seq = []
+ for i in range(frames_per_clip):
+ start = int(np.round(seg_size * i))
+ end = int(np.round(seg_size * (i + 1)))
+ if rand_sample:
+ seq.append(random.randint(start, end))
+ else:
+ seq.append((start + end) // 2)
+
+ return [frame_indices[idx] for idx in seq]
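+ # Illustrative example: with 101 available frame indices and frames_per_clip=4,
+ # seg_size = 25.0, so the deterministic (rand_sample=False) mode picks the segment
+ # midpoints and returns frame indices [12, 37, 62, 87].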
+
+ @property
+ def video_sampler(self):
+ return self._video_sampler
+
+ def __next__(self) -> dict:
+ """
+ Retrieves the next clip based on the clip sampling strategy and video sampler.
+
+ Returns:
+ A video clip with the following format if transform is None:
+ {
+ 'video': <video_tensor>,
+ 'label': <index_label>,
+ 'video_index': <video_index>,
+ 'clip_index': <clip_index>,
+ 'aug_index': <aug_index>, the augmentation index, as augmentations
+ might generate multiple views for one clip.
+ }
+ Otherwise, the transform defines the clip output.
+ """
+ if not self._video_sampler_iter:
+ # Setup MultiProcessSampler here - after PyTorch DataLoader workers are spawned.
+ self._video_sampler_iter = iter(MultiProcessSampler(self._video_sampler))
+
+ if self._loaded_video:
+ video, video_index = self._loaded_video
+ else:
+ video_index = next(self._video_sampler_iter)
+ path_to_video_frames = self._path_to_videos[video_index]
+ video = FrameVideo.from_frame_paths(path_to_video_frames)
+ self._loaded_video = (video, video_index)
+
+ clip_start, clip_end, clip_index, aug_index, is_last_clip = self._clip_sampler(
+ self._next_clip_start_time, video.duration
+ )
+ # Only load the clip once and reuse previously stored clip if there are multiple
+ # views for augmentations to perform on the same clip.
+ if aug_index == 0:
+ self._loaded_clip = video.get_clip(0, video.duration, self._frame_filter)
+ self._next_clip_start_time = clip_end
+
+ if is_last_clip:
+ self._loaded_video = None
+ self._next_clip_start_time = 0.0
+
+ sample_dict = {
+ "video": self._loaded_clip["video"],
+ "label": self._labels[video_index],
+ "video_name": str(video_index),
+ "video_index": video_index,
+ "clip_index": clip_index,
+ "aug_index": aug_index,
+ }
+ if self._transform is not None:
+ sample_dict = self._transform(sample_dict)
+
+ return sample_dict
+
+ def __iter__(self):
+ return self
+
+
+def _read_video_paths_and_labels(
+ label_name_file: str,
+ video_label_file: str,
+ video_path_label_file: str,
+ prefix: str = "",
+) -> Tuple[List[str], List[int]]:
+ """
+ Args:
+ label_name_file (str): ssv2 label file that contains the label names and
+ indexes. ('/path/to/folder/something-something-v2-labels.json')
+ video_label_file (str): a file that contains video ids and the corresponding
+ video label. (e.g., '/path/to/folder/something-something-v2-train.json')
+ video_path_label_file (str): a file that contains frame paths for each
+ video and the corresponding frame label. The file must be a space separated
+ csv of the format:
+ `original_vido_id video_id frame_id path labels`
+ prefix (str): prefix path to add to all paths from video_path_label_file.
+
+ Returns:
+ image_paths (list): list of lists containing the path to each frame.
+ labels (list): list containing label of each video.
+ """
+ # Loading image paths.
+ paths = defaultdict(list)
+ with g_pathmgr.open(video_path_label_file, "r") as f:
+ # Space separated CSV with format: original_vido_id video_id frame_id path labels
+ csv_reader = csv.DictReader(f, delimiter=" ")
+ for row in csv_reader:
+ assert len(row) == 5
+ video_name = row["original_vido_id"]
+ path = os.path.join(prefix, row["path"])
+ paths[video_name].append(path)
+
+ # Loading label names.
+ with g_pathmgr.open(label_name_file, "r") as f:
+ label_name_dict = json.load(f)
+
+ with g_pathmgr.open(video_label_file, "r") as f:
+ video_label_json = json.load(f)
+
+ labels = []
+ image_paths = []
+ for video in video_label_json:
+ video_name = video["id"]
+ if video_name in paths:
+ template = video["template"]
+ template = template.replace("[", "")
+ template = template.replace("]", "")
+ label = int(label_name_dict[template])
+ image_paths.append(paths[video_name])
+ labels.append(label)
+
+ return image_paths, labels
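+ # Usage sketch (illustrative only; file paths are hypothetical and my_clip_sampler
+ # is assumed to be a ClipSampler instance from pytorchvideo.data.clip_sampling):
+ #
+ # dataset = SSv2(
+ #     label_name_file="ssv2/something-something-v2-labels.json",
+ #     video_label_file="ssv2/something-something-v2-train.json",
+ #     video_path_label_file="ssv2/train_frame_list.csv",
+ #     clip_sampler=my_clip_sampler,
+ #     frames_per_clip=8,
+ # )
+ # clip = next(iter(dataset))  # dict with "video", "label", "video_index", ...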
diff --git a/pytorchvideo/data/ucf101.py b/pytorchvideo/data/ucf101.py
new file mode 100644
index 00000000..c9b39d35
--- /dev/null
+++ b/pytorchvideo/data/ucf101.py
@@ -0,0 +1,10 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from .encoded_video_dataset import labeled_encoded_video_dataset
+
+
+"""
+ Action recognition video dataset for UCF101 stored as an encoded video.
+
+"""
+Ucf101 = labeled_encoded_video_dataset
diff --git a/pytorchvideo/data/utils.py b/pytorchvideo/data/utils.py
new file mode 100644
index 00000000..ac53653c
--- /dev/null
+++ b/pytorchvideo/data/utils.py
@@ -0,0 +1,278 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from __future__ import annotations
+
+import csv
+import itertools
+import logging
+import math
+import threading
+from collections import defaultdict
+from dataclasses import Field, field as dataclass_field, fields as dataclass_fields
+from typing import Any, Callable, Dict, Iterable, List, Tuple, Union
+
+import numpy as np
+import torch
+from iopath.common.file_io import g_pathmgr
+
+
+logger = logging.getLogger(__name__)
+
+
+def thwc_to_cthw(data: torch.Tensor) -> torch.Tensor:
+ """
+ Permute tensor from (time, height, width, channel) to
+ (channel, time, height, width).
+ """
+ return data.permute(3, 0, 1, 2)
+
+
+def secs_to_pts(time_in_seconds: float, time_base: float, start_pts: float) -> float:
+ """
+ Converts a time (in seconds) to the given time base and start_pts offset
+ presentation time.
+
+ Returns:
+ pts (float): The time in the given time base.
+ """
+ if time_in_seconds == math.inf:
+ return math.inf
+
+ time_base = float(time_base)
+ return int(time_in_seconds / time_base) + start_pts
+
+
+def pts_to_secs(time_in_seconds: float, time_base: float, start_pts: float) -> float:
+ """
+ Converts a presentation timestamp (expressed in the given time base and offset by start_pts) to seconds.
+
+ Returns:
+ time_in_seconds (float): The corresponding time in seconds.
+ """
+ if time_in_seconds == math.inf:
+ return math.inf
+
+ return (time_in_seconds - start_pts) * float(time_base)
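+ # Illustrative example: with a toy time base of 1 / 1024 (exact in floating point),
+ # secs_to_pts(2.0, 1 / 1024, 0) == 2048 and pts_to_secs(2048, 1 / 1024, 0) == 2.0.
+ # Real containers typically use time bases such as 1 / 90000.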
+
+
+class MultiProcessSampler(torch.utils.data.Sampler):
+ """
+ MultiProcessSampler splits sample indices from a PyTorch Sampler evenly across
+ workers spawned by a PyTorch DataLoader.
+ """
+
+ def __init__(self, sampler: torch.utils.data.Sampler) -> None:
+ self._sampler = sampler
+
+ def __iter__(self):
+ """
+ Returns:
+ Iterator for underlying PyTorch Sampler indices split by worker id.
+ """
+ worker_info = torch.utils.data.get_worker_info()
+ if worker_info is not None and worker_info.num_workers != 0:
+
+ # Split sampler indexes by worker.
+ video_indexes = range(len(self._sampler))
+ worker_splits = np.array_split(video_indexes, worker_info.num_workers)
+ worker_id = worker_info.id
+ worker_split = worker_splits[worker_id]
+ if len(worker_split) == 0:
+ logger.warning(
+ f"More data workers({worker_info.num_workers}) than videos"
+ f"({len(self._sampler)}). For optimal use of processes "
+ "reduce num_workers."
+ )
+ return iter(())
+
+ iter_start = worker_split[0]
+ iter_end = worker_split[-1] + 1
+ worker_sampler = itertools.islice(iter(self._sampler), iter_start, iter_end)
+ else:
+
+ # If no worker processes found, we return the full sampler.
+ worker_sampler = iter(self._sampler)
+
+ return worker_sampler
+
+
+def optional_threaded_foreach(
+ target: Callable, args_iterable: Iterable[Tuple], multithreaded: bool
+):
+ """
+ Applies 'target' function to each Tuple args in 'args_iterable'.
+ If 'multithreaded' a thread is spawned for each function application.
+
+ Args:
+ target (Callable):
+ A function that takes as input the parameters in each args_iterable Tuple.
+
+ args_iterable (Iterable[Tuple]):
+ An iterable of the tuples each containing a set of parameters to pass to
+ target.
+
+ multithreaded (bool):
+ Whether or not the target applications are parallelized by thread.
+ """
+
+ if multithreaded:
+ threads = []
+ for args in args_iterable:
+ thread = threading.Thread(target=target, args=args)
+ thread.start()
+ threads.append(thread)
+
+ for t in threads: # Wait for all threads to complete
+ t.join()
+ else:
+ for args in args_iterable:
+ target(*args)
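+ # Usage sketch (illustrative only):
+ #
+ # results = [None] * 3
+ #
+ # def square(index: int, value: int) -> None:
+ #     results[index] = value * value
+ #
+ # optional_threaded_foreach(square, enumerate([1, 2, 3]), multithreaded=True)
+ # # results == [1, 4, 9]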
+
+
+class DataclassFieldCaster:
+ """
+ Class to allow subclasses wrapped in @dataclass to automatically
+ cast fields to their relevant type by default.
+
+ Also allows for an arbitrary initialization function to be applied
+ for a given field.
+ """
+
+ COMPLEX_INITIALIZER = "DataclassFieldCaster__complex_initializer"
+
+ def __post_init__(self) -> None:
+ f"""
+ This function is run by the dataclass library after '__init__'.
+
+ Here we use this to ensure all fields are cast to their declared types
+ and to apply any complex field_initializer functions that have been
+ declared via the 'complex_initialized_dataclass_field' method of
+ this class.
+
+ A complex field_initializer for a given field would be stored in the
+ field.metadata dictionary at:
+ key = '{self.COMPLEX_INITIALIZER}' (self.COMPLEX_INITIALIZER)
+
+ """
+ for field in dataclass_fields(self):
+ value = getattr(self, field.name)
+ # First check if the datafield has been set to the declared type or
+ # if the datafield has a declared complex field_initializer.
+ if (
+ not isinstance(value, field.type)
+ or DataclassFieldCaster.COMPLEX_INITIALIZER in field.metadata
+ ):
+ # Apply the complex field_initializer function for this field's value,
+ # assert that the resultant type is the declared type of the field.
+ if DataclassFieldCaster.COMPLEX_INITIALIZER in field.metadata:
+ setattr(
+ self,
+ field.name,
+ field.metadata[DataclassFieldCaster.COMPLEX_INITIALIZER](value),
+ )
+ assert isinstance(getattr(self, field.name), field.type), (
+ f"'field_initializer' function of {field.name} must return "
+ f"type {field.type} but returned type {type(getattr(self, field.name))}"
+ )
+ else:
+ # Otherwise attempt to cast the field's value to its declared type.
+ setattr(self, field.name, field.type(value))
+
+ @staticmethod
+ def complex_initialized_dataclass_field(
+ field_initializer: Callable, **kwargs
+ ) -> Field:
+ """
+ Allows for the setting of a function to be called on the
+ named parameter associated with a field during initialization,
+ after __init__() completes.
+
+ Args:
+ field_initializer (Callable):
+ The function to be called on the field
+
+ **kwargs: To be passed downstream to the dataclasses.field method
+
+ Returns:
+ (dataclasses.Field) that contains the field_initializer and kwargs info
+ """
+ metadata = kwargs.get("metadata") or {}
+ assert DataclassFieldCaster.COMPLEX_INITIALIZER not in metadata
+ metadata[DataclassFieldCaster.COMPLEX_INITIALIZER] = field_initializer
+ kwargs["metadata"] = metadata
+ return dataclass_field(**kwargs)
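+ # Usage sketch (illustrative only; VideoRecord is a hypothetical dataclass defined
+ # in a module that does not use `from __future__ import annotations`, so field
+ # types resolve to real classes):
+ #
+ # from dataclasses import dataclass
+ #
+ # @dataclass
+ # class VideoRecord(DataclassFieldCaster):
+ #     video_id: str
+ #     duration: float
+ #     tags: list = DataclassFieldCaster.complex_initialized_dataclass_field(
+ #         lambda value: value.split("|")
+ #     )
+ #
+ # record = VideoRecord("vid_001", "12.5", "indoor|cooking")
+ # # record.duration == 12.5 (cast from str), record.tags == ["indoor", "cooking"]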
+
+
+def load_dataclass_dict_from_csv(
+ input_csv_file_path: str,
+ dataclass_class: type,
+ dict_key_field: str,
+ list_per_key: bool = False,
+) -> Dict[Any, Union[Any, List[Any]]]:
+ """
+ Args:
+ input_csv_file_path (str): File path of the csv to read from
+ dataclass_class (type): The dataclass to read each row into.
+ dict_key_field (str): The field of 'dataclass_class' to use as
+ the dictionary key.
+ list_per_key (bool): If True, the returned dictionary contains a list of
+ dataclass objects per key, rather than a single unique dataclass
+ object per key. Defaults to False.
+
+ Returns:
+ Dict[Any, Union[Any, List[Any]] mapping from the dataclass
+ value at attr = dict_key_field to either:
+
+ if 'list_per_key', a list of all dataclass objects that
+ have equal values at attr = dict_key_field, equal to the key
+
+ if not 'list_per_key', the unique dataclass object
+ for which the value at attr = dict_key_field is equal to the key
+
+ Raises:
+ AssertionError: if not 'list_per_key' and there are
+ dataclass objects with equal values at attr = dict_key_field
+ """
+
+ output_dict = defaultdict(list) if list_per_key else {}
+ with g_pathmgr.open(input_csv_file_path) as dataclass_file:
+ reader = csv.reader(dataclass_file, delimiter=",", quotechar='"')
+ column_index = {header: i for i, header in enumerate(next(reader))}
+ for line in reader:
+ datum = dataclass_class(
+ *(
+ line[column_index[field.name]]
+ for field in dataclass_fields(dataclass_class)
+ )
+ )
+ dict_key = getattr(datum, dict_key_field)
+ if list_per_key:
+ output_dict[dict_key].append(datum)
+ else:
+ assert (
+ dict_key not in output_dict
+ ), f"Multiple entries for {output_dict} in {dataclass_file}"
+ output_dict[dict_key] = datum
+ return output_dict
+
+
+def save_dataclass_objs_to_headered_csv(
+ dataclass_objs: List[Any], file_name: str
+) -> None:
+ """
+ Saves a list of @dataclass objects to the specified csv file.
+
+ Args:
+ dataclass_objs (List[Any]):
+ A list of @dataclass objects to be saved.
+
+ file_name (str):
+ file_name to save csv data to.
+ """
+ dataclass_type = type(dataclass_objs[0])
+ field_names = [f.name for f in dataclass_fields(dataclass_type)]
+ with g_pathmgr.open(file_name, "w") as f:
+ writer = csv.writer(f, delimiter=",", quotechar='"')
+ writer.writerow(field_names)
+ for obj in dataclass_objs:
+ writer.writerow([getattr(obj, f) for f in field_names])
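+ # Usage sketch (illustrative only; FrameRow and frames.csv are hypothetical):
+ #
+ # @dataclass
+ # class FrameRow(DataclassFieldCaster):
+ #     video_id: str
+ #     frame_id: int
+ #
+ # rows = [FrameRow("vid_001", 0), FrameRow("vid_001", 1)]
+ # save_dataclass_objs_to_headered_csv(rows, "frames.csv")
+ # by_video = load_dataclass_dict_from_csv(
+ #     "frames.csv", FrameRow, "video_id", list_per_key=True
+ # )
+ # # by_video["vid_001"] holds both FrameRow objects (frame_id cast back to int).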
diff --git a/pytorchvideo/data/video.py b/pytorchvideo/data/video.py
new file mode 100644
index 00000000..077d1a00
--- /dev/null
+++ b/pytorchvideo/data/video.py
@@ -0,0 +1,72 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import io
+import pathlib
+from abc import ABC, abstractmethod
+from typing import BinaryIO, Dict, Optional
+
+import torch
+from iopath.common.file_io import g_pathmgr
+
+
+class Video(ABC):
+ """
+ Video provides an interface to access clips from a video container.
+ """
+
+ @classmethod
+ def from_path(cls, file_path: str, decode_audio: bool = True):
+ """
+ Fetches the given video path using PathManager (allowing remote uris to be
+ fetched) and constructs the EncodedVideo object.
+
+ Args:
+ file_path (str): a PathManager file-path.
+ """
+ # We read the file with PathManager rather than pyav so that we can read from
+ # remote uris.
+ with g_pathmgr.open(file_path, "rb") as fh:
+ video_file = io.BytesIO(fh.read())
+
+ return cls(video_file, pathlib.Path(file_path).name, decode_audio)
+
+ @property
+ @abstractmethod
+ def duration(self) -> float:
+ """
+ Returns:
+ duration of the video in seconds
+ """
+ pass
+
+ @abstractmethod
+ def get_clip(
+ self, start_sec: float, end_sec: float
+ ) -> Dict[str, Optional[torch.Tensor]]:
+ """
+ Retrieves frames from the internal video at the specified start and end times
+ in seconds (the video always starts at 0 seconds).
+
+ Args:
+ start_sec (float): the clip start time in seconds
+ end_sec (float): the clip end time in seconds
+ Returns:
+ video_data_dictonary: A dictionary mapping strings to tensor of the clip's
+ underlying data.
+
+ """
+ pass
+
+ @abstractmethod
+ def __init__(
+ self,
+ file: BinaryIO,
+ video_name: Optional[str] = None,
+ decode_audio: bool = True,
+ ) -> None:
+ """
+ Args:
+ file (BinaryIO): a file-like object (e.g. io.BytesIO or io.StringIO) that
+ contains the encoded video.
+ """
+ pass
diff --git a/pytorchvideo/layers/__init__.py b/pytorchvideo/layers/__init__.py
new file mode 100644
index 00000000..d41f6a68
--- /dev/null
+++ b/pytorchvideo/layers/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from .fusion import ConcatFusion, ReduceFusion, make_fusion_layer
+from .mlp import make_multilayer_perceptron
+from .positional_encoding import PositionalEncoding
diff --git a/pytorchvideo/layers/accelerator/__init__.py b/pytorchvideo/layers/accelerator/__init__.py
new file mode 100644
index 00000000..5c7f19c6
--- /dev/null
+++ b/pytorchvideo/layers/accelerator/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/pytorchvideo/layers/accelerator/mobile_cpu/__init__.py b/pytorchvideo/layers/accelerator/mobile_cpu/__init__.py
new file mode 100644
index 00000000..5c7f19c6
--- /dev/null
+++ b/pytorchvideo/layers/accelerator/mobile_cpu/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/pytorchvideo/layers/accelerator/mobile_cpu/activation_functions.py b/pytorchvideo/layers/accelerator/mobile_cpu/activation_functions.py
new file mode 100644
index 00000000..db74384c
--- /dev/null
+++ b/pytorchvideo/layers/accelerator/mobile_cpu/activation_functions.py
@@ -0,0 +1,103 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+"""
+This file contains supported activation functions in efficient block and helper code.
+ All supported activation functions are child classes of EfficientBlockBase, and are
+ included in supported_act_functions.
+"""
+import torch
+import torch.nn as nn
+from pytorchvideo.accelerator.efficient_blocks.efficient_block_base import (
+ EfficientBlockBase,
+)
+from pytorchvideo.layers.swish import Swish as SwishCustomOp
+
+
+class _NaiveSwish(nn.Module):
+ """
+ Helper class to implement naive swish for deployment. It is not intended to be
+ used to build networks.
+ """
+
+ def __init__(self):
+ super().__init__()
+ self.mul_func = nn.quantized.FloatFunctional()
+
+ def forward(self, x):
+ return self.mul_func.mul(x, torch.sigmoid(x))
+
+
+class Swish(EfficientBlockBase):
+ """
+ Swish activation function for efficient block. In its original form for training,
+ it uses the custom-op version of swish for better training-time memory efficiency.
+ In its deployable form, it uses naive swish, as the custom op is not supported on
+ PyTorch Mobile. For better latency on mobile CPU, use HardSwish instead.
+ """
+
+ def __init__(self):
+ super().__init__()
+ self.act = SwishCustomOp()
+
+ def forward(self, x):
+ return self.act(x)
+
+ def convert(self, *args, **kwarg):
+ self.act = _NaiveSwish()
+
+
+class HardSwish(EfficientBlockBase):
+ """
+ Hardswish activation function. It is natively supported by PyTorch Mobile and has
+ better latency than Swish in int8 mode.
+ """
+
+ def __init__(self):
+ super().__init__()
+ self.act = nn.Hardswish()
+
+ def forward(self, x):
+ return self.act(x)
+
+ def convert(self, *args, **kwarg):
+ pass
+
+
+class ReLU(EfficientBlockBase):
+ """
+ ReLU activation function for EfficientBlockBase.
+ """
+
+ def __init__(self):
+ super().__init__()
+ self.act = nn.ReLU(inplace=True)
+
+ def forward(self, x):
+ return self.act(x)
+
+ def convert(self, *args, **kwarg):
+ pass
+
+
+class Identity(EfficientBlockBase):
+ """
+ Identity operation for EfficientBlockBase.
+ """
+
+ def __init__(self):
+ super().__init__()
+ self.act = nn.Identity()
+
+ def forward(self, x):
+ return self.act(x)
+
+ def convert(self, *args, **kwarg):
+ pass
+
+
+supported_act_functions = {
+ "relu": ReLU,
+ "swish": Swish,
+ "hswish": HardSwish,
+ "identity": Identity,
+}
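+ # Usage sketch (illustrative only): activation blocks are built by name and
+ # converted in place for deployment.
+ #
+ # act = supported_act_functions["swish"]()
+ # y_train = act(torch.randn(1, 8, 4, 4))   # custom-op swish during training
+ # act.convert()                            # swap in the naive, mobile-friendly swish
+ # y_deploy = act(torch.randn(1, 8, 4, 4))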
diff --git a/pytorchvideo/layers/accelerator/mobile_cpu/attention.py b/pytorchvideo/layers/accelerator/mobile_cpu/attention.py
new file mode 100644
index 00000000..3a6309e4
--- /dev/null
+++ b/pytorchvideo/layers/accelerator/mobile_cpu/attention.py
@@ -0,0 +1,109 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from copy import deepcopy
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from fvcore.nn.squeeze_excitation import SqueezeExcitation as SqueezeExcitationFVCore
+from pytorchvideo.accelerator.efficient_blocks.efficient_block_base import (
+ EfficientBlockBase,
+)
+
+from .conv_helper import _Reshape, _SkipConnectMul
+
+
+class SqueezeExcitation(EfficientBlockBase):
+ """
+ Efficient Squeeze-Excitation (SE). The Squeeze-Excitation block is described in:
+ *Hu et al., Squeeze-and-Excitation Networks, arXiv:1709.01507*
+ This implementation has the same instantiation interface as the SE implementation
+ in fvcore, and in its original mode for training it is just a wrapped version of
+ the fvcore SE. Since the conv3d in fvcore's original SE implementation is not well
+ supported by QNNPACK, a convert() method is implemented here which converts the
+ class instance into an equivalent, efficient, deployable form.
+
+ The convert_flag variable records whether the SqueezeExcitation instance has been
+ converted; SqueezeExcitation is in its original form if convert_flag is False, and
+ in its deployable form if convert_flag is True.
+ """
+
+ def __init__(
+ self,
+ num_channels: int,
+ num_channels_reduced: Optional[int] = None,
+ reduction_ratio: float = 2.0,
+ is_3d: bool = False,
+ activation: Optional[nn.Module] = None,
+ ) -> None:
+ """
+ Args:
+ num_channels (int): Number of input channels.
+ num_channels_reduced (int):
+ Number of reduced channels. If none, uses reduction_ratio to calculate.
+ reduction_ratio (float):
+ How much num_channels should be reduced if num_channels_reduced is not provided.
+ is_3d (bool): Whether we're operating on 3d data (or 2d), default 2d.
+ activation (nn.Module): Activation function used, defaults to ReLU.
+ """
+ super().__init__()
+ # Implement SE from FVCore here for training.
+ self.se = SqueezeExcitationFVCore(
+ num_channels,
+ num_channels_reduced=num_channels_reduced,
+ reduction_ratio=reduction_ratio,
+ is_3d=is_3d,
+ activation=activation,
+ )
+ self.is_3d = is_3d
+ self.convert_flag = False
+
+ def convert(self, input_blob_size, **kwargs):
+ """
+ Converts into efficient version of squeeze-excite (SE) for CPU.
+ It changes conv in original SE into linear layer (better supported by CPU).
+ """
+ if self.is_3d:
+ avg_pool = nn.AdaptiveAvgPool3d(1)
+ else:
+ avg_pool = nn.AdaptiveAvgPool2d(1)
+ """
+ Reshape tensor size to (B, C) for linear layer.
+ """
+ reshape0 = _Reshape((input_blob_size[0], input_blob_size[1]))
+ fc0 = nn.Linear(
+ self.se.block[0].in_channels,
+ self.se.block[0].out_channels,
+ bias=(not (self.se.block[0].bias is None)),
+ )
+ state_dict_fc0 = deepcopy(self.se.block[0].state_dict())
+ state_dict_fc0["weight"] = state_dict_fc0["weight"].squeeze()
+ fc0.load_state_dict(state_dict_fc0)
+ activation = deepcopy(self.se.block[1])
+ fc1 = nn.Linear(
+ self.se.block[2].in_channels,
+ self.se.block[2].out_channels,
+ bias=(not (self.se.block[2].bias is None)),
+ )
+ state_dict_fc1 = deepcopy(self.se.block[2].state_dict())
+ state_dict_fc1["weight"] = state_dict_fc1["weight"].squeeze()
+ fc1.load_state_dict(state_dict_fc1)
+ sigmoid = deepcopy(self.se.block[3])
+ """
+ Output of linear layer has output shape of (B, C). Need to reshape to proper
+ shape before multiplying with input tensor.
+ """
+ reshape_size_after_sigmoid = (input_blob_size[0], input_blob_size[1], 1, 1) + (
+ (1,) if self.is_3d else ()
+ )
+ reshape1 = _Reshape(reshape_size_after_sigmoid)
+ se_layers = nn.Sequential(
+ avg_pool, reshape0, fc0, activation, fc1, sigmoid, reshape1
+ )
+ # Add final elementwise multiplication and replace self.se
+ self.se = _SkipConnectMul(se_layers)
+ self.convert_flag = True
+
+ def forward(self, x) -> torch.Tensor:
+ out = self.se(x)
+ return out
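+ # Usage sketch (illustrative only): train with the wrapped fvcore SE, then convert
+ # for deployment given the expected input blob size.
+ #
+ # se = SqueezeExcitation(num_channels=16, is_3d=True)
+ # x = torch.randn(1, 16, 4, 8, 8)           # (B, C, T, H, W)
+ # y_train = se(x)
+ # se.convert(input_blob_size=(1, 16, 4, 8, 8))
+ # y_deploy = se(x)                          # same shape, now linear-layer based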
diff --git a/pytorchvideo/layers/accelerator/mobile_cpu/conv_helper.py b/pytorchvideo/layers/accelerator/mobile_cpu/conv_helper.py
new file mode 100644
index 00000000..9d9d7c22
--- /dev/null
+++ b/pytorchvideo/layers/accelerator/mobile_cpu/conv_helper.py
@@ -0,0 +1,556 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+"""
+This file contains helper classes for building conv3d efficient blocks.
+ The helper classes are intended to be instantiated inside efficient blocks,
+ not to be used by users to build networks.
+"""
+
+from copy import deepcopy
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+
+
+class _Reshape(nn.Module):
+ """
+ Helper class to implement data reshape as a module.
+ Args:
+ reshape_size (tuple): size of data after reshape.
+ """
+
+ def __init__(
+ self,
+ reshape_size: Tuple,
+ ):
+ super().__init__()
+ self.reshape_size = reshape_size
+
+ def forward(self, x):
+ return torch.reshape(x, self.reshape_size)
+
+
+class _SkipConnectMul(nn.Module):
+ """
+ Helper class to implement skip multiplication.
+ Args:
+ layer (nn.Module): layer for skip multiplication. With input x, _SkipConnectMul
+ implements layer(x)*x.
+ """
+
+ def __init__(
+ self,
+ layer: nn.Module,
+ ):
+ super().__init__()
+ self.layer = layer
+ self.mul_func = nn.quantized.FloatFunctional()
+
+ def forward(self, x):
+ return self.mul_func.mul(x, self.layer(x))
+
+
+class _Conv3dTemporalKernel3Decomposed(nn.Module):
+ """
+ Helper class for decomposing conv3d with temporal kernel of 3 into equivalent conv2ds.
+ In conv3d with temporal kernel 3 and input I, for output temporal index of t (O[:,:,t,:,:]),
+ the conv can be expressed as:
+ O[:,:,t,:,:] = conv3d(I[:,:,t:t+3,:,:])
+ = conv2d_0(I[:,:,t,:,:]) + conv2d_1(I[:,:,t+1,:,:]) + conv2d_2(I[:,:,t+2,:,:])
+ If bias is considered:
+ O[:,:,t,:,:] = conv3d_w_bias(I[:,:,t:t+3,:,:])
+ = conv2d_0_wo_bias(I[:,:,t,:,:])
+ + conv2d_1_w_bias(I[:,:,t+1,:,:]) + conv2d_2_wo_bias(I[:,:,t+2,:,:])
+ The input Conv3d also needs zero padding of size 1 in temporal dimension.
+ """
+
+ def __init__(
+ self,
+ conv3d_in: nn.Conv3d,
+ input_THW_tuple: Tuple,
+ ):
+ """
+ Args:
+ conv3d_in (nn.Module): input nn.Conv3d module to be converted
+ into equivalent conv2d.
+ input_THW_tuple (tuple): input THW size for conv3d_in during forward.
+ """
+ super().__init__()
+ assert conv3d_in.padding[0] == 1, (
+ "_Conv3dTemporalKernel3Eq only support temporal padding of 1, "
+ f"but got {conv3d_in.padding[0]}"
+ )
+ assert conv3d_in.padding_mode == "zeros", (
+ "_Conv3dTemporalKernel3Eq only support zero padding, "
+ f"but got {conv3d_in.padding_mode}"
+ )
+ self._input_THW_tuple = input_THW_tuple
+ padding_2d = conv3d_in.padding[1:]
+ in_channels = conv3d_in.in_channels
+ out_channels = conv3d_in.out_channels
+ kernel_size = conv3d_in.kernel_size[1:]
+ groups = conv3d_in.groups
+ stride_2d = conv3d_in.stride[1:]
+ # Create 3 conv2d to emulate conv3d.
+ if (
+ self._input_THW_tuple[0] > 1
+ ): # Those two conv2d are needed only when temporal input > 1.
+ self._conv2d_3_3_0 = nn.Conv2d(
+ in_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ padding=padding_2d,
+ stride=stride_2d,
+ groups=groups,
+ bias=False,
+ )
+ self._conv2d_3_3_2 = nn.Conv2d(
+ in_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ padding=padding_2d,
+ stride=stride_2d,
+ groups=groups,
+ bias=False,
+ )
+ self._conv2d_3_3_1 = nn.Conv2d(
+ in_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ padding=padding_2d,
+ stride=stride_2d,
+ groups=groups,
+ bias=(conv3d_in.bias is not None),
+ )
+
+ state_dict = conv3d_in.state_dict()
+ state_dict_1 = deepcopy(state_dict)
+ state_dict_1["weight"] = state_dict["weight"][:, :, 1]
+ self._conv2d_3_3_1.load_state_dict(state_dict_1)
+
+ if self._input_THW_tuple[0] > 1:
+ state_dict_0 = deepcopy(state_dict)
+ state_dict_0["weight"] = state_dict["weight"][:, :, 0]
+ if conv3d_in.bias is not None:
+ """
+ Don't need bias for other conv2d instances to avoid duplicated addition of bias.
+ """
+ state_dict_0.pop("bias")
+ self._conv2d_3_3_0.load_state_dict(state_dict_0)
+
+ state_dict_2 = deepcopy(state_dict)
+ state_dict_2["weight"] = state_dict["weight"][:, :, 2]
+ if conv3d_in.bias is not None:
+ state_dict_2.pop("bias")
+ self._conv2d_3_3_2.load_state_dict(state_dict_2)
+
+ self._add_funcs = nn.ModuleList(
+ [
+ nn.quantized.FloatFunctional()
+ for _ in range(2 * (self._input_THW_tuple[0] - 1))
+ ]
+ )
+ self._cat_func = nn.quantized.FloatFunctional()
+
+ def forward(self, x):
+ """
+ Use three conv2d to emulate conv3d.
+ This forward assumes zero padding of size 1 in temporal dimension.
+ """
+ if self._input_THW_tuple[0] > 1:
+ out_tensor_list = []
+ """
+ First output plane in temporal dimension,
+ conv2d_3_3_0 is skipped due to zero padding.
+ """
+ cur_tensor = (
+ self._add_funcs[0]
+ .add(self._conv2d_3_3_1(x[:, :, 0]), self._conv2d_3_3_2(x[:, :, 1]))
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+ for idx in range(2, self._input_THW_tuple[0]):
+ cur_tensor = (
+ self._add_funcs[2 * idx - 3]
+ .add(
+ self._add_funcs[2 * idx - 2].add(
+ self._conv2d_3_3_0(x[:, :, idx - 2]),
+ self._conv2d_3_3_1(x[:, :, idx - 1]),
+ ),
+ self._conv2d_3_3_2(x[:, :, idx]),
+ )
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+ """
+ Last output plane in temporal domain, conv2d_3_3_2 is skipped due to zero padding.
+ """
+ cur_tensor = (
+ self._add_funcs[-1]
+ .add(self._conv2d_3_3_0(x[:, :, -2]), self._conv2d_3_3_1(x[:, :, -1]))
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+ return self._cat_func.cat(out_tensor_list, 2)
+ else: # Degenerated to simple conv2d
+ return self._conv2d_3_3_1(x[:, :, 0]).unsqueeze(2)
+
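+# A minimal, illustrative sanity check (not part of the module API; the channel and
+# shape values below are arbitrary). The kernel-3 decomposition is expected to
+# reproduce the original conv3d output up to floating-point error:
+#
+#   conv3d = nn.Conv3d(2, 4, kernel_size=3, padding=1, bias=True)
+#   decomposed = _Conv3dTemporalKernel3Decomposed(conv3d, input_THW_tuple=(4, 8, 8))
+#   x = torch.randn(1, 2, 4, 8, 8)
+#   assert torch.allclose(conv3d(x), decomposed(x), atol=1e-5)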
+
+class _Conv3dTemporalKernel5Decomposed(nn.Module):
+ """
+ Helper class for decomposing conv3d with kernel size of (5, k, k) into equivalent conv2ds.
+ In such conv3d and input I, for output temporal index of t (O[:,:,t,:,:]), the conv
+ can be expressed as:
+ O[:,:,t,:,:] = conv3d(I[:,:,t:t+5,:,:])
+ = conv2d_0(I[:,:,t,:,:]) + conv2d_1(I[:,:,t+1,:,:]) + conv2d_2(I[:,:,t+2,:,:])
+ + conv2d_3(I[:,:,t+3,:,:]) + conv2d_4(I[:,:,t+4,:,:])
+ If bias is considered:
+       O[:,:,t,:,:] = conv3d_w_bias(I[:,:,t:t+5,:,:])
+ = conv2d_0_wo_bias(I[:,:,t,:,:])
+ + conv2d_1_wo_bias(I[:,:,t+1,:,:]) + conv2d_2_w_bias(I[:,:,t+2,:,:])
+                      + conv2d_3_wo_bias(I[:,:,t+3,:,:]) + conv2d_4_wo_bias(I[:,:,t+4,:,:])
+    The input Conv3d also needs zero padding of size 2 in the temporal dimension at the beginning and end.
+ """
+
+ def __init__(
+ self,
+ conv3d_in: nn.Conv3d,
+ thw_shape: Tuple[int, int, int],
+ ):
+ """
+ Args:
+ conv3d_in (nn.Module): input nn.Conv3d module to be converted
+ into equivalent conv2d.
+ thw_shape (tuple): input THW size for conv3d_in during forward.
+ """
+ super().__init__()
+        assert conv3d_in.padding[0] == 2, (
+            "_Conv3dTemporalKernel5Decomposed only supports temporal padding of 2, "
+            f"but got {conv3d_in.padding[0]}"
+        )
+        assert conv3d_in.padding_mode == "zeros", (
+            "_Conv3dTemporalKernel5Decomposed only supports zero padding, "
+            f"but got {conv3d_in.padding_mode}"
+        )
+ self._thw_shape = thw_shape
+ padding_2d = conv3d_in.padding[1:]
+ in_channels = conv3d_in.in_channels
+ out_channels = conv3d_in.out_channels
+ kernel_size = conv3d_in.kernel_size[1:]
+ groups = conv3d_in.groups
+ stride_2d = conv3d_in.stride[1:]
+        # Create 5 conv2d to emulate conv3d.
+ t, h, w = self._thw_shape
+ args_dict = {
+ "in_channels": in_channels,
+ "out_channels": out_channels,
+ "kernel_size": kernel_size,
+ "padding": padding_2d,
+ "stride": stride_2d,
+ "groups": groups,
+ }
+
+ for iter_idx in range(5):
+ if iter_idx != 2:
+ if t > 1: # Those four conv2d are needed only when temporal input > 1.
+ self.add_module(
+ f"_conv2d_{iter_idx}", nn.Conv2d(**args_dict, bias=False)
+ )
+ else: # _conv2d_2 is needed for all circumstances.
+ self.add_module(
+ f"_conv2d_{iter_idx}",
+ nn.Conv2d(**args_dict, bias=(conv3d_in.bias is not None)),
+ )
+
+ # State dict for _conv2d_2
+ original_state_dict = conv3d_in.state_dict()
+ state_dict_to_load = deepcopy(original_state_dict)
+ state_dict_to_load["weight"] = original_state_dict["weight"][:, :, 2]
+ self._conv2d_2.load_state_dict(state_dict_to_load)
+
+ if t > 1:
+ if conv3d_in.bias is not None:
+ # Don't need bias for other conv2d instances to avoid duplicated
+ # addition of bias.
+ state_dict_to_load.pop("bias")
+ # State dict for _conv2d_0, _conv2d_1, _conv2d_3, _conv2d_4
+ state_dict_to_load["weight"] = original_state_dict["weight"][:, :, 0]
+ self._conv2d_0.load_state_dict(state_dict_to_load)
+
+ state_dict_to_load["weight"] = original_state_dict["weight"][:, :, 1]
+ self._conv2d_1.load_state_dict(state_dict_to_load)
+
+ state_dict_to_load["weight"] = original_state_dict["weight"][:, :, 3]
+ self._conv2d_3.load_state_dict(state_dict_to_load)
+
+ state_dict_to_load["weight"] = original_state_dict["weight"][:, :, 4]
+ self._conv2d_4.load_state_dict(state_dict_to_load)
+        # Elementwise adds are needed in the forward function; use nn.quantized.FloatFunctional()
+        # for better quantization support. Each output temporal plane needs at most 4 elementwise
+        # adds; boundary planes need fewer because some conv2d outputs fall on zero padding.
+        # See forward() for more details.
+ self._add_funcs = nn.ModuleList(
+ [nn.quantized.FloatFunctional() for _ in range(4 * t - 6)]
+ )
+ self._cat_func = nn.quantized.FloatFunctional()
+
+ def forward(self, x):
+ """
+        Use five conv2d to emulate conv3d.
+ Args:
+ x (torch.Tensor): 5D tensor of (B, C, T, H, W)
+ """
+ t, h, w = self._thw_shape
+ out_tensor_list = []
+ if (
+ t == 1
+ ): # Degenerated to simple conv2d, but make sure output still has T dimension
+ return self._conv2d_2(x[:, :, 0]).unsqueeze(2)
+ elif t == 2:
+            # out_tensor_list[0]: _conv2d_0, _conv2d_1 and _conv2d_4 are
+            # applied to zero padding.
+ cur_tensor = (
+ self._add_funcs[0]
+ .add(self._conv2d_2(x[:, :, 0]), self._conv2d_3(x[:, :, 1]))
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+            # out_tensor_list[1]: _conv2d_0, _conv2d_3 and _conv2d_4 are
+            # applied to zero padding.
+
+ cur_tensor = (
+ self._add_funcs[1]
+ .add(self._conv2d_1(x[:, :, 0]), self._conv2d_2(x[:, :, 1]))
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+ elif t == 3:
+            # out_tensor_list[0]: _conv2d_0, _conv2d_1 are applied to zero padding.
+ cur_tensor = (
+ self._add_funcs[0]
+ .add(
+ self._add_funcs[1].add(
+ self._conv2d_2(x[:, :, 0]), self._conv2d_3(x[:, :, 1])
+ ),
+ self._conv2d_4(x[:, :, 2]),
+ )
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+            # out_tensor_list[1]: _conv2d_0, _conv2d_4 are applied to zero padding.
+ cur_tensor = (
+ self._add_funcs[2]
+ .add(
+ self._add_funcs[3].add(
+ self._conv2d_1(x[:, :, 0]), self._conv2d_2(x[:, :, 1])
+ ),
+ self._conv2d_3(x[:, :, 2]),
+ )
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+            # out_tensor_list[2]: _conv2d_3, _conv2d_4 are applied to zero padding.
+ cur_tensor = (
+ self._add_funcs[4]
+ .add(
+ self._add_funcs[5].add(
+ self._conv2d_0(x[:, :, 0]), self._conv2d_1(x[:, :, 1])
+ ),
+ self._conv2d_2(x[:, :, 2]),
+ )
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+ elif t == 4:
+            # out_tensor_list[0]: _conv2d_0, _conv2d_1 are applied to zero padding.
+ cur_tensor = (
+ self._add_funcs[0]
+ .add(
+ self._add_funcs[1].add(
+ self._conv2d_2(x[:, :, 0]), self._conv2d_3(x[:, :, 1])
+ ),
+ self._conv2d_4(x[:, :, 2]),
+ )
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+            # out_tensor_list[1]: _conv2d_0 is applied to zero padding.
+ cur_tensor = (
+ self._add_funcs[2]
+ .add(
+ self._add_funcs[3].add(
+ self._add_funcs[4].add(
+ self._conv2d_1(x[:, :, 0]),
+ self._conv2d_2(x[:, :, 1]),
+ ),
+ self._conv2d_3(x[:, :, 2]),
+ ),
+ self._conv2d_4(x[:, :, 3]),
+ )
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+            # out_tensor_list[2]: _conv2d_4 is applied to zero padding.
+ cur_tensor = (
+ self._add_funcs[5]
+ .add(
+ self._add_funcs[6].add(
+ self._add_funcs[7].add(
+ self._conv2d_0(x[:, :, 0]),
+ self._conv2d_1(x[:, :, 1]),
+ ),
+ self._conv2d_2(x[:, :, 2]),
+ ),
+ self._conv2d_3(x[:, :, 3]),
+ )
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+            # out_tensor_list[3]: _conv2d_3, _conv2d_4 are applied to zero padding.
+ cur_tensor = (
+ self._add_funcs[8]
+ .add(
+ self._add_funcs[9].add(
+ self._conv2d_0(x[:, :, 1]), self._conv2d_1(x[:, :, 2])
+ ),
+ self._conv2d_2(x[:, :, 3]),
+ )
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+ else: # t >= 5
+            # out_tensor_list[0]: _conv2d_0, _conv2d_1 are applied to zero padding.
+ add_func_idx_base = 0
+ cur_tensor = (
+ self._add_funcs[add_func_idx_base]
+ .add(
+ self._add_funcs[add_func_idx_base + 1].add(
+ self._conv2d_2(x[:, :, 0]), self._conv2d_3(x[:, :, 1])
+ ),
+ self._conv2d_4(x[:, :, 2]),
+ )
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+ add_func_idx_base += 2
+            # out_tensor_list[1]: _conv2d_0 is applied to zero padding.
+ cur_tensor = (
+ self._add_funcs[add_func_idx_base]
+ .add(
+ self._add_funcs[add_func_idx_base + 1].add(
+ self._add_funcs[add_func_idx_base + 2].add(
+ self._conv2d_1(x[:, :, 0]),
+ self._conv2d_2(x[:, :, 1]),
+ ),
+ self._conv2d_3(x[:, :, 2]),
+ ),
+ self._conv2d_4(x[:, :, 3]),
+ )
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+ add_func_idx_base += 3
+ # out_tensor_list[2:-2]: zero padding has no effect.
+ for idx in range(4, t):
+ cur_tensor = (
+ self._add_funcs[add_func_idx_base]
+ .add(
+ self._add_funcs[add_func_idx_base + 1].add(
+ self._add_funcs[add_func_idx_base + 2].add(
+ self._add_funcs[add_func_idx_base + 3].add(
+ self._conv2d_0(x[:, :, idx - 4]),
+ self._conv2d_1(x[:, :, idx - 3]),
+ ),
+ self._conv2d_2(x[:, :, idx - 2]),
+ ),
+ self._conv2d_3(x[:, :, idx - 1]),
+ ),
+ self._conv2d_4(x[:, :, idx]),
+ )
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+ add_func_idx_base += 4
+            # out_tensor_list[-2]: _conv2d_4 is applied to zero padding.
+ cur_tensor = (
+ self._add_funcs[add_func_idx_base]
+ .add(
+ self._add_funcs[add_func_idx_base + 1].add(
+ self._add_funcs[add_func_idx_base + 2].add(
+ self._conv2d_0(x[:, :, -4]),
+ self._conv2d_1(x[:, :, -3]),
+ ),
+ self._conv2d_2(x[:, :, -2]),
+ ),
+ self._conv2d_3(x[:, :, -1]),
+ )
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+ add_func_idx_base += 3
+            # out_tensor_list[-1]: _conv2d_3, _conv2d_4 are applied to zero padding.
+ cur_tensor = (
+ self._add_funcs[add_func_idx_base]
+ .add(
+ self._add_funcs[add_func_idx_base + 1].add(
+ self._conv2d_0(x[:, :, -3]),
+ self._conv2d_1(x[:, :, -2]),
+ ),
+ self._conv2d_2(x[:, :, -1]),
+ )
+ .unsqueeze(2)
+ )
+ out_tensor_list.append(cur_tensor)
+ return self._cat_func.cat(out_tensor_list, 2)
+
+
+class _Conv3dTemporalKernel1Decomposed(nn.Module):
+ """
+ Helper class for decomposing conv3d with temporal kernel of 1 into conv2d on
+ multiple temporal planes.
+ In conv3d with temporal kernel 1 and input I, for output temporal index of t (O[:,:,t,:,:]),
+ the conv can be expressed as:
+ O[:,:,t,:,:] = conv3d(I[:,:,t,:,:])
+ = conv2d(I[:,:,t,:,:])
+ The full output can be obtained by concat O[:,:,t,:,:] for t in 0...T,
+ where T is the length of I in temporal dimension.
+ """
+
+ def __init__(
+ self,
+ conv3d_eq: nn.Conv3d,
+ input_THW_tuple: Tuple,
+ ):
+ """
+ Args:
+ conv3d_eq (nn.Module): input nn.Conv3d module to be converted
+ into equivalent conv2d.
+ input_THW_tuple (tuple): input THW size for conv3d_eq during forward.
+ """
+ super().__init__()
+ # create equivalent conv2d module
+ in_channels = conv3d_eq.in_channels
+ out_channels = conv3d_eq.out_channels
+ bias_flag = conv3d_eq.bias is not None
+ self.conv2d_eq = nn.Conv2d(
+ in_channels,
+ out_channels,
+ kernel_size=(conv3d_eq.kernel_size[1], conv3d_eq.kernel_size[2]),
+ stride=(conv3d_eq.stride[1], conv3d_eq.stride[2]),
+ groups=conv3d_eq.groups,
+ bias=bias_flag,
+ padding=(conv3d_eq.padding[1], conv3d_eq.padding[2]),
+ dilation=(conv3d_eq.dilation[1], conv3d_eq.dilation[2]),
+ )
+ state_dict = conv3d_eq.state_dict()
+ state_dict["weight"] = state_dict["weight"].squeeze(2)
+ self.conv2d_eq.load_state_dict(state_dict)
+ self.input_THW_tuple = input_THW_tuple
+
+ def forward(self, x):
+ out_tensor_list = []
+ for idx in range(self.input_THW_tuple[0]):
+ cur_tensor = self.conv2d_eq(x[:, :, idx]).unsqueeze(2)
+ out_tensor_list.append(cur_tensor)
+ return torch.cat(out_tensor_list, 2)
diff --git a/pytorchvideo/layers/accelerator/mobile_cpu/convolutions.py b/pytorchvideo/layers/accelerator/mobile_cpu/convolutions.py
new file mode 100644
index 00000000..a0134bc0
--- /dev/null
+++ b/pytorchvideo/layers/accelerator/mobile_cpu/convolutions.py
@@ -0,0 +1,592 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import logging
+from collections import OrderedDict
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+from pytorchvideo.accelerator.efficient_blocks.efficient_block_base import (
+ EfficientBlockBase,
+)
+
+from .activation_functions import supported_act_functions
+from .conv_helper import (
+ _Conv3dTemporalKernel1Decomposed,
+ _Conv3dTemporalKernel3Decomposed,
+ _Conv3dTemporalKernel5Decomposed,
+ _Reshape,
+)
+
+
+class Conv3dPwBnAct(EfficientBlockBase):
+ """
+ Implements Conv3d + Bn + Activation for pointwise layers.
+ The conv layer has fixed kernel_size = (1,1,1),
+ groups = 1, padding = 0, stride = 1, dilation = 1.
+
+ Input
+ |
+ ↓
+ conv3d (1x1x1)
+ ↓
+ BatchNorm (optional)
+ ↓
+ Activation
+
+    Conv3dPwBnAct is in original form (for training) once instantiated. Users can
+    call the convert() method to switch it into deployable form.
+
+    The convert_flag attribute records whether the Conv3dPwBnAct instance has been
+    converted: it is in original form if convert_flag is False, and in deployable
+    form if convert_flag is True.
+
+ Current implementation of this layer in QNNPACK is very efficient.
+ Args:
+ in_channels (int): number of input channels for conv3d 1x1x1.
+ out_channels (int): number of output channels for conv3d 1x1x1.
+ bias (bool): if true, use bias for conv.
+ activation (str): applies selected activation from supported_act_functions.
+ See activation_functions.py for more info about supported activations.
+ Currently ReLU ('relu'), Swish ('swish'), Hardswish ('hswish'), Identity
+ ('identity') are supported.
+ use_bn (bool): if true, use batchnorm.
+ norm_eps (float): epsilon for batchnorm.
+ norm_momentum (float): momentum for batchnorm.
+
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ bias=False,
+ activation: str = "relu",
+ use_bn=True,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ ):
+ super().__init__()
+ self._in_channels = in_channels
+ self._out_channels = out_channels
+ self.act = activation
+ kernel = OrderedDict()
+ kernel["conv"] = nn.Conv3d(in_channels, out_channels, kernel_size=1, bias=bias)
+ if use_bn:
+ kernel["bn"] = nn.BatchNorm3d(
+ out_channels, eps=norm_eps, momentum=norm_momentum
+ )
+ assert (
+ activation in supported_act_functions
+ ), f"Conv3dPwBnAct: {activation} is not in supported_act_functions."
+ kernel["act"] = supported_act_functions[activation]()
+ self.kernel = nn.Sequential(kernel)
+ self.convert_flag = False
+
+ def convert(
+ self,
+ input_blob_size: Tuple,
+ **kwargs,
+ ):
+ """
+ Converts Conv3d into equivalent Conv2d for Pytorch Mobile deployment.
+        This conversion is done by first fusing conv3d with bn,
+        then converting conv3d into an equivalent conv2d,
+        and optionally fusing conv2d with relu.
+ After conversion, the forwarding of this module becomes:
+ Input (5d tensor) --> reshape (4d tensor) --> conv2d (4d tensor)
+ --> reshape (5d tensor) --> output (5d tensor)
+ Args:
+ input_blob_size (tuple): blob size at the input of Conv3dPwBnAct instance.
+ kwargs (any): any extra keyword arguments from upstream unused by convert().
+ """
+ assert (
+ self.convert_flag is False
+ ), "Conv3dPwBnAct: already converted, cannot be converted again"
+ self.kernel.eval()
+ # First fuse conv and bn if bn exists.
+ if hasattr(self.kernel, "bn"):
+ self.kernel = torch.quantization.fuse_modules(self.kernel, ["conv", "bn"])
+
+ batch_size = input_blob_size[0]
+ input_THW_tuple = input_blob_size[2:]
+ self._input_tensor_reshape_size = (
+ batch_size,
+ self._in_channels, # C
+ input_THW_tuple[0] * input_THW_tuple[1], # T*H
+ input_THW_tuple[2], # W
+ )
+ self._output_tensor_size = (
+ batch_size,
+ self._out_channels, # C
+ input_THW_tuple[0], # T
+ input_THW_tuple[1], # H
+ input_THW_tuple[2], # W
+ )
+ conv2d_eq = nn.Conv2d(
+ self._in_channels,
+ self._out_channels,
+ kernel_size=1,
+ bias=(self.kernel.conv.bias is not None),
+ )
+ conv_state_dict = self.kernel.conv.state_dict()
+ conv_state_dict["weight"] = conv_state_dict["weight"].squeeze(2)
+ conv2d_eq.load_state_dict(conv_state_dict)
+ self.kernel.conv = conv2d_eq
+        # Convert activation function
+ self.kernel.act.convert(input_blob_size, **kwargs)
+ # Fuse act with conv after conv3d -> conv2d if act is relu
+ if self.act == "relu":
+ self.kernel = torch.quantization.fuse_modules(
+ self.kernel, ["conv", "act.act"]
+ )
+ # Insert reshape layers before/after conv2d
+ self.kernel = nn.Sequential(
+ _Reshape(self._input_tensor_reshape_size),
+ self.kernel,
+ _Reshape(self._output_tensor_size),
+ )
+ self.convert_flag = True
+ # Set new kernel in eval mode again
+ self.kernel.eval()
+
+ def forward(self, x):
+ x = self.kernel(x)
+ return x
+
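+# A minimal, illustrative usage sketch (channel and shape values are arbitrary): the
+# block is built in original form, and convert() is expected to keep the eval-mode
+# output numerically equivalent (up to floating-point error) while switching to the
+# deployable conv2d form. The same pattern applies to the other Conv3d...BnAct blocks
+# in this file.
+#
+#   layer = Conv3dPwBnAct(in_channels=3, out_channels=8, activation="relu")
+#   layer.eval()
+#   x = torch.randn(1, 3, 4, 6, 6)
+#   y_ref = layer(x)
+#   layer.convert(input_blob_size=(1, 3, 4, 6, 6))
+#   assert torch.allclose(y_ref, layer(x), atol=1e-5)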
+
+class Conv3d3x3x3DwBnAct(EfficientBlockBase):
+ """
+ Implements Conv3d (3x3x3 dw) + (optional) Bn + Activation layers.
+ The conv layer has fixed kernel_size = (3,3,3), depthwise, zero padding size of
+ (1,1,1), temporal stride = 1, dilation = 1
+
+ Input
+ |
+ ↓
+ conv3d (3x3x3 dw)
+ ↓
+ BatchNorm (optional)
+ ↓
+ Activation
+
+ Current implementation of this layer in QNNPACK is reasonably efficient.
+
+    The convert_flag attribute records whether the Conv3d3x3x3DwBnAct instance
+    has been converted: it is in original form if convert_flag is False,
+    and in deployable form if convert_flag is True.
+
+ Args:
+ in_channels (int): number of channels for conv3d 3x3x3 dw.
+ spatial_stride (tuple length of 2): spatial stride for conv.
+ bias (bool): if true, use bias for conv.
+ activation (str): applies selected activation from supported_act_functions.
+ See activation_functions.py for more info about supported activations.
+ Currently ReLU ('relu'), Swish ('swish'), Hardswish ('hswish'), Identity
+ ('identity') are supported.
+ use_bn (bool): if true, use batchnorm.
+ norm_eps (float): epsilon for batchnorm.
+ norm_momentum (float): momentum for batchnorm.
+
+ Current implementation of this layer in Pytorch Mobile is efficient.
+ Sidenote: QNNPACK has best support for dw with 3x3 spatial kernel.
+ For other spatial kernels like 7x7 dw, the efficiency may be lower.
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ spatial_stride: int = 1,
+ bias=False,
+ activation: str = "relu",
+ use_bn=True,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ ):
+ super().__init__()
+ kernel = OrderedDict()
+ conv_stride = (1, spatial_stride, spatial_stride)
+ kernel["conv"] = nn.Conv3d(
+ in_channels,
+ in_channels,
+ kernel_size=(3, 3, 3),
+ stride=conv_stride,
+ groups=in_channels,
+ padding=1,
+ bias=bias,
+ )
+ if use_bn:
+ kernel["bn"] = nn.BatchNorm3d(
+ in_channels, eps=norm_eps, momentum=norm_momentum
+ )
+ assert (
+ activation in supported_act_functions
+ ), f"Conv3d3x3x3DwBnAct: {activation} is not in supported_act_functions."
+ kernel["act"] = supported_act_functions[activation]()
+ self.kernel = nn.Sequential(kernel)
+
+ self.convert_flag = False
+
+ def convert(
+ self,
+ input_blob_size: Tuple,
+ **kwargs,
+ ):
+ """
+ Converts Conv3d into equivalent Conv2d for efficient Pytorch Mobile deployment.
+ Args:
+ input_blob_size (tuple): blob size at the input of Conv3d3x3x3DwBnAct
+ instance during forward.
+ kwargs (any): any keyword argument (unused).
+ """
+ assert (
+ self.convert_flag is False
+ ), "Conv3d3x3x3DwBnAct: already converted, cannot be converted twice."
+ self.kernel.eval()
+ # Fuse conv and bn if bn exists.
+ if hasattr(self.kernel, "bn"):
+ self.kernel = torch.quantization.fuse_modules(self.kernel, ["conv", "bn"])
+ self.kernel.conv = _Conv3dTemporalKernel3Decomposed(
+ self.kernel.conv, input_blob_size[2:]
+ )
+        # Convert activation function
+ self.kernel.act.convert(input_blob_size, **kwargs)
+        # Since conv3d is converted into multiple conv2d, will not fuse conv with act
+        # to keep arithmetic equivalency.
+ self.convert_flag = True
+ # Set new kernel in eval mode again
+ self.kernel.eval()
+
+ def forward(self, x):
+ x = self.kernel(x)
+ return x
+
+
+class Conv3dTemporalKernel1BnAct(EfficientBlockBase):
+ """
+ Implements Conv3d + Bn + Activation where Conv3d has temporal kernel of 1.
+ The conv layer has padding[0] = 0, stride[0] = 1, dilation[0] = 1.
+
+ Input
+ |
+ ↓
+ conv3d (1xkxk)
+ ↓
+ BatchNorm (optional)
+ ↓
+ Activation
+
+ Current implementation of this layer in QNNPACK is reasonably efficient
+ (not as efficient as Conv3dPwBnAct for 1x1x1 kernel).
+ Args:
+        in_channels (int): number of input channels for the 1xkxk conv3d.
+        out_channels (int): number of output channels for the 1xkxk conv3d.
+        bias (bool): if true, use bias for conv.
+        groups (int): number of groups for conv.
+        spatial_kernel (int): spatial kernel for conv3d.
+        spatial_stride (int): spatial stride for conv3d.
+ spatial_padding (int): spatial padding for conv3d.
+ spatial_dilation (int): spatial dilation for conv3d.
+ activation (str): applies selected activation from supported_act_functions.
+ See activation_functions.py for more info about supported activations.
+ Currently ReLU ('relu'), Swish ('swish'), Hardswish ('hswish'), Identity
+ ('identity') are supported.
+ use_bn (bool): if true, use batchnorm.
+ norm_eps (float): epsilon for batchnorm.
+ norm_momentum (float): momentum for batchnorm.
+
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ bias=False,
+ groups: int = 1,
+ spatial_kernel: int = 1,
+ spatial_stride: int = 1,
+ spatial_padding: int = 0,
+ spatial_dilation: int = 1,
+ activation: str = "relu",
+ use_bn=True,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ ):
+ super().__init__()
+
+ kernel_size = (1, spatial_kernel, spatial_kernel)
+ stride = (1, spatial_stride, spatial_stride)
+ padding = (0, spatial_padding, spatial_padding)
+ dilation = (1, spatial_dilation, spatial_dilation)
+ kernel = OrderedDict()
+ kernel["conv"] = nn.Conv3d(
+ in_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ padding=padding,
+ stride=stride,
+ dilation=dilation,
+ groups=groups,
+ bias=bias,
+ )
+ if use_bn:
+ kernel["bn"] = nn.BatchNorm3d(
+ out_channels, eps=norm_eps, momentum=norm_momentum
+ )
+ assert (
+ activation in supported_act_functions
+ ), f"Conv3dTemporalKernel1BnAct: {activation} is not in supported_act_functions."
+ kernel["act"] = supported_act_functions[activation]()
+ self.kernel = nn.Sequential(kernel)
+
+ self.convert_flag = False
+
+ def convert(
+ self,
+ input_blob_size: Tuple,
+ **kwargs,
+ ):
+ """
+ Converts Conv3d into equivalent Conv2d for QNNPACK deployment.
+        This conversion is done by first fusing conv3d with bn,
+        then converting conv3d into an equivalent conv2d,
+        and optionally fusing conv2d with relu.
+ Args:
+ input_blob_size (tuple): blob size at the input of
+ Conv3dTemporalKernel1BnAct instance during forward.
+ kwargs (any): any keyword argument (unused).
+ """
+ assert (
+ self.convert_flag is False
+ ), "Conv3dTemporalKernel1BnAct: already converted, cannot be converted again"
+ self.kernel.eval()
+ # First fuse conv and bn if bn exists.
+ if hasattr(self.kernel, "bn"):
+ self.kernel = torch.quantization.fuse_modules(self.kernel, ["conv", "bn"])
+
+ self.kernel.conv = _Conv3dTemporalKernel1Decomposed(
+ self.kernel.conv, input_blob_size[2:]
+ )
+        # Convert activation function
+ self.kernel.act.convert(input_blob_size, **kwargs)
+
+ self.convert_flag = True
+ # Set new kernel in eval mode again
+ self.kernel.eval()
+
+ def forward(self, x):
+ x = self.kernel(x)
+ return x
+
+
+class Conv3d3x1x1BnAct(EfficientBlockBase):
+ """
+ Implements Conv3d (3x1x1) + (optional) Bn + Activation for pointwise layers.
+ The conv layer has fixed kernel of (3, 1, 1), zero padding size of
+ (1, 0, 0), stride = (1, 1, 1), dilation = 1.
+
+ Input
+ |
+ ↓
+ conv3d (3x1x1)
+ ↓
+ BatchNorm (optional)
+ ↓
+ Activation
+
+ For regular convolution (i.e., groups=1), current implementation of this layer in
+ QNNPACK is reasonably efficient.
+    For depthwise convolution (i.e., groups=out_channels), the current implementation of
+    this layer in QNNPACK is not as efficient as Conv3d3x3x3DwBnAct, as QNNPACK does not
+    have optimization for 1x1 depthwise convolution. The fp32 latencies of Conv3d3x1x1BnAct
+    and Conv3d3x3x3DwBnAct are similar, while with int8 operation Conv3d3x1x1BnAct
+    is 1.5X slower than Conv3d3x3x3DwBnAct.
+
+    The convert_flag attribute records whether the Conv3d3x1x1BnAct instance has been
+    converted: it is in original form if convert_flag is False, and in deployable form
+    if convert_flag is True.
+
+ Args:
+ in_channels (int): number of input channels for conv3d 3x1x1.
+ out_channels (int): number of output channels for conv3d 3x1x1.
+ groups (int): number of groups for conv.
+ bias (bool): if true, use bias for conv.
+ activation (str): applies selected activation from supported_act_functions.
+ See activation_functions.py for more info about supported activations.
+ Currently ReLU ('relu'), Swish ('swish'), Hardswish ('hswish'), Identity
+ ('identity') are supported.
+ use_bn (bool): if true, use batchnorm.
+ norm_eps (float): epsilon for batchnorm.
+ norm_momentum (float): momentum for batchnorm.
+
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ groups: int = 1,
+ bias=False,
+ activation: str = "relu",
+ use_bn=True,
+ norm_eps=1e-5,
+ norm_momentum=0.1,
+ ):
+ super().__init__()
+ kernel = OrderedDict()
+ kernel["conv"] = nn.Conv3d(
+ in_channels,
+ out_channels,
+ kernel_size=(3, 1, 1),
+ groups=groups,
+ padding=(1, 0, 0),
+ bias=bias,
+ )
+
+        if groups == out_channels:
+            logging.warning(
+                "Conv3d3x1x1BnAct has low efficiency for depthwise conv. "
+                "Consider using Conv3d3x3x3DwBnAct instead."
+            )
+
+ if use_bn:
+ kernel["bn"] = nn.BatchNorm3d(
+ out_channels, eps=norm_eps, momentum=norm_momentum
+ )
+ assert (
+ activation in supported_act_functions
+ ), f"Conv3d3x1x1BnAct: {activation} is not in supported_act_functions."
+ kernel["act"] = supported_act_functions[activation]()
+ self.kernel = nn.Sequential(kernel)
+ self.convert_flag = False
+
+ def convert(
+ self,
+ input_blob_size,
+ **kwargs,
+ ):
+ """
+ Converts Conv3d into equivalent Conv2d for Pytorch Mobile deployment
+
+ """
+ assert (
+ self.convert_flag is False
+ ), "Conv3d3x1x1BnAct: already converted, cannot be converted twice"
+ self.kernel.eval()
+ # Fuse conv and bn if bn exists.
+ if hasattr(self.kernel, "bn"):
+ self.kernel = torch.quantization.fuse_modules(self.kernel, ["conv", "bn"])
+ self.kernel.conv = _Conv3dTemporalKernel3Decomposed(
+ self.kernel.conv, input_blob_size[2:]
+ )
+ # Convert activation function
+ self.kernel.act.convert(input_blob_size, **kwargs)
+ # Since conv3d is converted into multiple conv2d, will not fuse conv with relu
+ # to keep arithmetic equivalency.
+ self.convert_flag = True
+ self.kernel.eval()
+
+ def forward(self, x):
+ x = self.kernel(x)
+ return x
+
+
+class Conv3d5x1x1BnAct(EfficientBlockBase):
+ """
+ Implements Conv3d (5x1x1) + (optional) Bn + Activation for pointwise layers.
+ The conv layer has fixed kernel of (5, 1, 1), zero padding size of
+ (2, 0, 0), stride = (1, 1, 1), dilation = 1.
+
+ Input
+ |
+ ↓
+ conv3d (5x1x1)
+ ↓
+ BatchNorm (optional)
+ ↓
+ Activation
+
+ For regular convolution (i.e., groups=1), current implementation of this layer in
+ QNNPACK is reasonably efficient.
+
+    The convert_flag attribute records whether the Conv3d5x1x1BnAct instance has been
+    converted: it is in original form if convert_flag is False, and in deployable form
+    if convert_flag is True.
+
+    Args:
+        in_channels (int): number of input channels for conv3d 5x1x1.
+        out_channels (int): number of output channels for conv3d 5x1x1.
+ groups (int): number of groups for conv.
+ bias (bool): if true, use bias for conv.
+ activation (str): applies selected activation from supported_act_functions.
+ See activation_functions.py for more info about supported activations.
+ Currently ReLU ('relu'), Swish ('swish'), Hardswish ('hswish'), Identity
+ ('identity') are supported.
+ use_bn (bool): if true, use batchnorm.
+ norm_eps (float): epsilon for batchnorm.
+ norm_momentum (float): momentum for batchnorm.
+
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ groups: int = 1,
+ bias=False,
+ activation: str = "relu",
+ use_bn=True,
+ norm_eps=1e-5,
+ norm_momentum=0.1,
+ ):
+ super().__init__()
+ kernel = OrderedDict()
+ kernel["conv"] = nn.Conv3d(
+ in_channels,
+ out_channels,
+ kernel_size=(5, 1, 1),
+ groups=groups,
+ padding=(2, 0, 0),
+ bias=bias,
+ )
+
+ if use_bn:
+ kernel["bn"] = nn.BatchNorm3d(
+ out_channels, eps=norm_eps, momentum=norm_momentum
+ )
+ assert (
+ activation in supported_act_functions
+ ), f"Conv3d5x1x1BnAct: {activation} is not in supported_act_functions."
+ kernel["act"] = supported_act_functions[activation]()
+ self.kernel = nn.Sequential(kernel)
+ self.convert_flag = False
+
+ def convert(self, input_blob_size, **kwargs):
+ """
+ Converts Conv3d into equivalent Conv2d for Pytorch Mobile deployment
+
+ """
+ assert (
+ self.convert_flag is False
+ ), "Conv3d5x1x1BnAct: already converted, cannot be converted twice"
+ self.kernel.eval()
+ # Fuse conv and bn if bn exists.
+ if hasattr(self.kernel, "bn"):
+ self.kernel = torch.quantization.fuse_modules(self.kernel, ["conv", "bn"])
+ self.kernel.conv = _Conv3dTemporalKernel5Decomposed(
+ self.kernel.conv, input_blob_size[2:]
+ )
+        # Convert activation function
+ self.kernel.act.convert(input_blob_size, **kwargs)
+ # Since conv3d is converted into multiple conv2d, will not fuse conv with relu
+ # to keep arithmetic equivalency.
+ self.convert_flag = True
+ self.kernel.eval()
+
+ def forward(self, x):
+ x = self.kernel(x)
+ return x
diff --git a/pytorchvideo/layers/accelerator/mobile_cpu/fully_connected.py b/pytorchvideo/layers/accelerator/mobile_cpu/fully_connected.py
new file mode 100644
index 00000000..83421d4f
--- /dev/null
+++ b/pytorchvideo/layers/accelerator/mobile_cpu/fully_connected.py
@@ -0,0 +1,26 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import torch.nn as nn
+from pytorchvideo.accelerator.efficient_blocks.no_op_convert_block import (
+ NoOpConvertBlock,
+)
+
+
+class FullyConnected(NoOpConvertBlock):
+ """
+ Implements fully connected layer. This operator is natively supported by QNNPACK for
+ mobile CPU with good efficiency, and no change is made upon convert().
+ Args:
+ in_features (int): input channels for FC layer.
+ out_features (int): output channels for FC layer.
+ bias (bool): if True, bias is applied
+ """
+
+ def __init__(
+ self,
+ in_features: int,
+ out_features: int,
+ bias: bool = True,
+ ):
+
+ super().__init__(model=nn.Linear(in_features, out_features, bias=bias))
diff --git a/pytorchvideo/layers/accelerator/mobile_cpu/pool.py b/pytorchvideo/layers/accelerator/mobile_cpu/pool.py
new file mode 100644
index 00000000..1e92ee9e
--- /dev/null
+++ b/pytorchvideo/layers/accelerator/mobile_cpu/pool.py
@@ -0,0 +1,113 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Tuple, Union
+
+import torch.nn as nn
+from pytorchvideo.accelerator.efficient_blocks.efficient_block_base import (
+ EfficientBlockBase,
+)
+from pytorchvideo.accelerator.efficient_blocks.no_op_convert_block import (
+ NoOpConvertBlock,
+)
+
+
+class AdaptiveAvgPool3dOutSize1(EfficientBlockBase):
+ """
+ Implements AdaptiveAvgPool3d with output (T, H, W) = (1, 1, 1). This operator has
+ better efficiency than AdaptiveAvgPool for mobile CPU.
+ """
+
+ def __init__(self):
+ super().__init__()
+ self.pool = nn.AdaptiveAvgPool3d(1)
+ self.convert_flag = False
+
+ def convert(self, input_blob_size: Tuple, **kwargs):
+ """
+ Converts AdaptiveAvgPool into AvgPool with constant kernel size for better
+ efficiency.
+ Args:
+ input_blob_size (tuple): blob size at the input of
+ AdaptiveAvgPool3dOutSize1 instance during forward.
+ kwargs (any): any keyword argument (unused).
+ """
+ assert (
+ self.convert_flag is False
+ ), "AdaptiveAvgPool3dOutSize1: already converted, cannot be converted again"
+ kernel_size = input_blob_size[2:]
+ self.pool = nn.AvgPool3d(kernel_size)
+ self.convert_flag = True
+
+ def forward(self, x):
+ return self.pool(x)
+
+
+class AdaptiveAvgPool2dOutSize1(EfficientBlockBase):
+ """
+ Implements AdaptiveAvgPool2d with output (H, W) = (1, 1). This operator has
+ better efficiency than AdaptiveAvgPool for mobile CPU.
+ """
+
+ def __init__(
+ self,
+ ):
+ super().__init__()
+ self.pool = nn.AdaptiveAvgPool2d(1)
+ self.convert_flag = False
+
+ def convert(self, input_blob_size: Tuple, **kwargs):
+ """
+ Converts AdaptiveAvgPool into AvgPool with constant kernel size for better
+ efficiency.
+ Args:
+ input_blob_size (tuple): blob size at the input of
+ AdaptiveAvgPool2dOutSize1 instance during forward.
+ kwargs (any): any keyword argument (unused).
+ """
+ assert (
+ self.convert_flag is False
+ ), "AdaptiveAvgPool2dOutSize1: already converted, cannot be converted again"
+ kernel_size = input_blob_size[2:]
+ self.pool = nn.AvgPool2d(kernel_size)
+ self.convert_flag = True
+
+ def forward(self, x):
+ return self.pool(x)
+
+
+class AdaptiveAvgPool3d(NoOpConvertBlock):
+ """
+ Implements AdaptiveAvgPool3d with any output (T, H, W) size. This operator is
+    supported by QNNPACK for mobile CPU with reasonable efficiency, and no change is
+ made upon convert(). If the output (T, H, W) = (1, 1, 1), use AdaptiveAvgPool3dOutSize1
+ for better efficiency.
+ Args:
+ output_size (int or tuple): when it is a tuple, the output (T, H, W) of pool
+ will be equal to output_size. When it is an int, the output (T, H, W)
+ will be equal to (output_size, output_size, output_size).
+ """
+
+ def __init__(
+ self,
+ output_size: Union[int, Tuple],
+ ):
+ super().__init__(model=nn.AdaptiveAvgPool3d(output_size))
+
+
+class AdaptiveAvgPool2d(NoOpConvertBlock):
+ """
+ Implements AdaptiveAvgPool2d with any output (H, W) size. This operator is
+    supported by QNNPACK for mobile CPU with reasonable efficiency, and no change is
+ made upon convert(). If the output (H, W) = (1, 1), use AdaptiveAvgPool2dOutSize1
+ for better efficiency.
+ Args:
+ output_size (int or tuple): when it is a tuple, the output (H, W) of pool
+ will be equal to output_size. When it is an int, the output (H, W)
+ will be equal to (output_size, output_size).
+ """
+
+ def __init__(
+ self,
+ output_size: Union[int, Tuple],
+ ):
+ super().__init__(model=nn.AdaptiveAvgPool2d(output_size))
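+
+
+# A minimal, illustrative usage sketch (shapes are arbitrary, and torch is assumed to
+# be imported in the calling code): after convert(), the adaptive pool is replaced by a
+# fixed-kernel AvgPool3d matching the input blob size, which should leave the output
+# unchanged.
+#
+#   pool = AdaptiveAvgPool3dOutSize1()
+#   x = torch.randn(2, 8, 4, 7, 7)
+#   y_ref = pool(x)
+#   pool.convert(input_blob_size=(2, 8, 4, 7, 7))
+#   assert torch.allclose(y_ref, pool(x), atol=1e-6)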
diff --git a/pytorchvideo/layers/batch_norm.py b/pytorchvideo/layers/batch_norm.py
new file mode 100644
index 00000000..ed4a24f5
--- /dev/null
+++ b/pytorchvideo/layers/batch_norm.py
@@ -0,0 +1,120 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import torch
+import torch.distributed as dist
+from fvcore.nn.distributed import differentiable_all_reduce
+from pytorchvideo.layers.distributed import get_world_size
+from torch import nn
+
+
+class NaiveSyncBatchNorm1d(nn.BatchNorm1d):
+ """
+ An implementation of 1D naive sync batch normalization. See details in
+ NaiveSyncBatchNorm2d below.
+ """
+
+ def forward(self, input):
+ if get_world_size() == 1 or not self.training:
+ return super().forward(input)
+
+ B, C = input.shape[0], input.shape[1]
+
+ mean = torch.mean(input, dim=[0, 2])
+ meansqr = torch.mean(input * input, dim=[0, 2])
+
+ assert B > 0, "SyncBatchNorm does not support zero batch size."
+
+ vec = torch.cat([mean, meansqr], dim=0)
+ vec = differentiable_all_reduce(vec) * (1.0 / dist.get_world_size())
+ mean, meansqr = torch.split(vec, C)
+ var = meansqr - mean * mean
+
+ invstd = torch.rsqrt(var + self.eps)
+ scale = self.weight * invstd
+ bias = self.bias - mean * scale
+ scale = scale.reshape(1, -1, 1)
+ bias = bias.reshape(1, -1, 1)
+
+ self.running_mean += self.momentum * (mean.detach() - self.running_mean)
+ self.running_var += self.momentum * (var.detach() - self.running_var)
+
+ return input * scale + bias
+
+
+class NaiveSyncBatchNorm2d(nn.BatchNorm2d):
+ """
+ An implementation of 2D naive sync batch normalization.
+    In PyTorch<=1.5, ``nn.SyncBatchNorm`` has incorrect gradient
+    when the batch size on each worker is different
+    (e.g., when scale augmentation is used, or when it is applied to a mask head).
+
+ This is a slower but correct alternative to `nn.SyncBatchNorm`.
+
+ Note:
+ This module computes overall statistics by using
+ statistics of each worker with equal weight. The result is true statistics
+ of all samples (as if they are all on one worker) only when all workers
+ have the same (N, H, W). This mode does not support inputs with zero batch size.
+ """
+
+ def forward(self, input):
+ if get_world_size() == 1 or not self.training:
+ return super().forward(input)
+
+ B, C = input.shape[0], input.shape[1]
+
+ mean = torch.mean(input, dim=[0, 2, 3])
+ meansqr = torch.mean(input * input, dim=[0, 2, 3])
+
+ assert B > 0, "SyncBatchNorm does not support zero batch size."
+
+ vec = torch.cat([mean, meansqr], dim=0)
+ vec = differentiable_all_reduce(vec) * (1.0 / dist.get_world_size())
+ mean, meansqr = torch.split(vec, C)
+ var = meansqr - mean * mean
+
+ invstd = torch.rsqrt(var + self.eps)
+ scale = self.weight * invstd
+ bias = self.bias - mean * scale
+ scale = scale.reshape(1, -1, 1, 1)
+ bias = bias.reshape(1, -1, 1, 1)
+
+ self.running_mean += self.momentum * (mean.detach() - self.running_mean)
+ self.running_var += self.momentum * (var.detach() - self.running_var)
+
+ return input * scale + bias
+
+
+class NaiveSyncBatchNorm3d(nn.BatchNorm3d):
+
+ """
+ An implementation of 3D naive sync batch normalization. See details in
+ NaiveSyncBatchNorm2d above.
+ """
+
+ def forward(self, input):
+ if get_world_size() == 1 or not self.training:
+ return super().forward(input)
+
+ B, C = input.shape[0], input.shape[1]
+
+ mean = torch.mean(input, dim=[0, 2, 3, 4])
+ meansqr = torch.mean(input * input, dim=[0, 2, 3, 4])
+
+ assert B > 0, "SyncBatchNorm does not support zero batch size."
+
+ vec = torch.cat([mean, meansqr], dim=0)
+ vec = differentiable_all_reduce(vec) * (1.0 / dist.get_world_size())
+ mean, meansqr = torch.split(vec, C)
+ var = meansqr - mean * mean
+
+ invstd = torch.rsqrt(var + self.eps)
+ scale = self.weight * invstd
+ bias = self.bias - mean * scale
+ scale = scale.reshape(1, -1, 1, 1, 1)
+ bias = bias.reshape(1, -1, 1, 1, 1)
+
+ self.running_mean += self.momentum * (mean.detach() - self.running_mean)
+ self.running_var += self.momentum * (var.detach() - self.running_var)
+
+ return input * scale + bias
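+
+
+# Illustrative usage note (shapes are arbitrary): these layers are drop-in replacements
+# for the corresponding nn.BatchNorm*d modules; with a world size of 1, or in eval mode,
+# they simply fall back to the standard BatchNorm forward, e.g.:
+#
+#   bn = NaiveSyncBatchNorm3d(num_features=16)
+#   y = bn(torch.randn(2, 16, 4, 8, 8))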
diff --git a/pytorchvideo/layers/convolutions.py b/pytorchvideo/layers/convolutions.py
new file mode 100644
index 00000000..35d2ebc9
--- /dev/null
+++ b/pytorchvideo/layers/convolutions.py
@@ -0,0 +1,237 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Callable, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from pytorchvideo.layers.utils import set_attributes
+from torch.nn.common_types import _size_3_t
+
+
+class ConvReduce3D(nn.Module):
+ """
+    Builds a list of convolutional operators and reduces their outputs, either by
+    summation or by concatenation along the channel dim (see ``reduction_method``).
+
+ ::
+
+ Conv3d, Conv3d, ..., Conv3d
+ ↓
+ Sum
+ """
+
+ def __init__(
+ self,
+ *,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Tuple[_size_3_t],
+ stride: Optional[Tuple[_size_3_t]] = None,
+ padding: Optional[Tuple[_size_3_t]] = None,
+ padding_mode: Optional[Tuple[str]] = None,
+ dilation: Optional[Tuple[_size_3_t]] = None,
+ groups: Optional[Tuple[int]] = None,
+ bias: Optional[Tuple[bool]] = None,
+ reduction_method: str = "sum",
+ ) -> None:
+ """
+ Args:
+ in_channels int: number of input channels.
+ out_channels int: number of output channels produced by the convolution(s).
+            kernel_size tuple(_size_3_t): Tuple of sizes of the convolutional kernels.
+            stride tuple(_size_3_t): Tuple of strides of the convolutions.
+            padding tuple(_size_3_t): Tuple of paddings added to all three sides of the
+                input.
+            padding_mode tuple(string): Tuple of padding modes for each conv.
+                Options include `zeros`, `reflect`, `replicate` or `circular`.
+            dilation tuple(_size_3_t): Tuple of spacings between kernel elements.
+            groups tuple(int): Tuple of numbers of blocked connections from input
+ channels to output channels.
+ bias tuple(bool): If `True`, adds a learnable bias to the output.
+ reduction_method str: Options include `sum` and `cat`.
+ """
+ super().__init__()
+ assert reduction_method in ("sum", "cat")
+ self.reduction_method = reduction_method
+ conv_list = []
+ for ind in range(len(kernel_size)):
+ conv_param = {
+ "in_channels": in_channels,
+ "out_channels": out_channels,
+ "kernel_size": kernel_size[ind],
+ }
+ if stride is not None and stride[ind] is not None:
+ conv_param["stride"] = stride[ind]
+ if padding is not None and padding[ind] is not None:
+ conv_param["padding"] = padding[ind]
+ if dilation is not None and dilation[ind] is not None:
+ conv_param["dilation"] = dilation[ind]
+ if groups is not None and groups[ind] is not None:
+ conv_param["groups"] = groups[ind]
+ if bias is not None and bias[ind] is not None:
+ conv_param["bias"] = bias[ind]
+ if padding_mode is not None and padding_mode[ind] is not None:
+ conv_param["padding_mode"] = padding_mode[ind]
+ conv_list.append(nn.Conv3d(**conv_param))
+ self.convs = nn.ModuleList(conv_list)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ output = []
+ for ind in range(len(self.convs)):
+ output.append(self.convs[ind](x))
+ if self.reduction_method == "sum":
+ output = torch.stack(output, dim=0).sum(dim=0, keepdim=False)
+ elif self.reduction_method == "cat":
+ output = torch.cat(output, dim=1)
+ return output
+
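+# A minimal, illustrative usage sketch (channel and shape values are arbitrary): two
+# parallel conv3d branches whose outputs are summed.
+#
+#   conv = ConvReduce3D(
+#       in_channels=16,
+#       out_channels=32,
+#       kernel_size=((1, 1, 1), (3, 3, 3)),
+#       padding=((0, 0, 0), (1, 1, 1)),
+#   )
+#   y = conv(torch.randn(2, 16, 4, 8, 8))  # -> (2, 32, 4, 8, 8)
+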
+
+def create_conv_2plus1d(
+ *,
+ # Conv configs.
+ in_channels: int,
+ out_channels: int,
+ inner_channels: int = None,
+ conv_xy_first: bool = False,
+ kernel_size: Tuple[int] = (3, 3, 3),
+ stride: Tuple[int] = (2, 2, 2),
+ padding: Tuple[int] = (1, 1, 1),
+ bias: bool = False,
+ dilation: Tuple[int] = (1, 1, 1),
+ groups: int = 1,
+ # BN configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+) -> nn.Module:
+ """
+    Create a 2plus1d conv layer. It factorizes a spatiotemporal convolution into a
+    1D temporal convolution and a 2D spatial convolution, with normalization and
+    activation in between.
+
+ ::
+
+ Conv_t (or Conv_xy if conv_xy_first = True)
+ ↓
+ Normalization
+ ↓
+ Activation
+ ↓
+ Conv_xy (or Conv_t if conv_xy_first = True)
+
+ Normalization options include: BatchNorm3d and None (no normalization).
+ Activation options include: ReLU, Softmax, Sigmoid, and None (no activation).
+
+ Args:
+ in_channels (int): input channel size of the convolution.
+ out_channels (int): output channel size of the convolution.
+ kernel_size (tuple): convolutional kernel size(s).
+ stride (tuple): convolutional stride size(s).
+ padding (tuple): convolutional padding size(s).
+ bias (bool): convolutional bias. If true, adds a learnable bias to the
+ output.
+ groups (int): Number of groups in convolution layers. value >1 is unsupported.
+ dilation (tuple): dilation value in convolution layers. value >1 is unsupported.
+ conv_xy_first (bool): If True, spatial convolution comes before temporal conv
+
+ norm (callable): a callable that constructs normalization layer, options
+ include nn.BatchNorm3d, None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+
+ activation (callable): a callable that constructs activation layer, options
+ include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
+ activation).
+
+ Returns:
+ (nn.Module): 2plus1d conv layer.
+ """
+ if inner_channels is None:
+ inner_channels = out_channels
+
+ assert (
+ groups == 1
+ ), "Support for groups is not implemented in R2+1 convolution layer"
+ assert (
+ max(dilation) == 1 and min(dilation) == 1
+    ), "Support for dilation is not implemented in R2+1 convolution layer"
+
+ conv_t_module = nn.Conv3d(
+ in_channels=in_channels if not conv_xy_first else inner_channels,
+ out_channels=inner_channels if not conv_xy_first else out_channels,
+ kernel_size=(kernel_size[0], 1, 1),
+ stride=(stride[0], 1, 1),
+ padding=(padding[0], 0, 0),
+ bias=bias,
+ )
+ norm_module = (
+ None
+ if norm is None
+ else norm(num_features=inner_channels, eps=norm_eps, momentum=norm_momentum)
+ )
+ activation_module = None if activation is None else activation()
+ conv_xy_module = nn.Conv3d(
+ in_channels=inner_channels if not conv_xy_first else in_channels,
+ out_channels=out_channels if not conv_xy_first else inner_channels,
+ kernel_size=(1, kernel_size[1], kernel_size[2]),
+ stride=(1, stride[1], stride[2]),
+ padding=(0, padding[1], padding[2]),
+ bias=bias,
+ )
+
+ return Conv2plus1d(
+ conv_t=conv_t_module,
+ norm=norm_module,
+ activation=activation_module,
+ conv_xy=conv_xy_module,
+ conv_xy_first=conv_xy_first,
+ )
+
+
+class Conv2plus1d(nn.Module):
+ """
+    Implementation of 2+1d Convolution by factorizing 3D Convolution into a 1D temporal
+ Convolution and a 2D spatial Convolution with Normalization and Activation module
+ in between:
+
+ ::
+
+ Conv_t (or Conv_xy if conv_xy_first = True)
+ ↓
+ Normalization
+ ↓
+ Activation
+ ↓
+ Conv_xy (or Conv_t if conv_xy_first = True)
+
+ The 2+1d Convolution is used to build the R(2+1)D network.
+ """
+
+ def __init__(
+ self,
+ *,
+ conv_t: nn.Module = None,
+ norm: nn.Module = None,
+ activation: nn.Module = None,
+ conv_xy: nn.Module = None,
+ conv_xy_first: bool = False,
+ ) -> None:
+ """
+ Args:
+ conv_t (torch.nn.modules): temporal convolution module.
+ norm (torch.nn.modules): normalization module.
+ activation (torch.nn.modules): activation module.
+ conv_xy (torch.nn.modules): spatial convolution module.
+ conv_xy_first (bool): If True, spatial convolution comes before temporal conv
+ """
+ super().__init__()
+ set_attributes(self, locals())
+ assert self.conv_t is not None
+ assert self.conv_xy is not None
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.conv_xy(x) if self.conv_xy_first else self.conv_t(x)
+ x = self.norm(x) if self.norm else x
+ x = self.activation(x) if self.activation else x
+ x = self.conv_t(x) if self.conv_xy_first else self.conv_xy(x)
+ return x
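+
+
+# A minimal, illustrative usage sketch (channel and shape values are arbitrary):
+#
+#   conv = create_conv_2plus1d(
+#       in_channels=16,
+#       out_channels=32,
+#       kernel_size=(3, 3, 3),
+#       stride=(1, 2, 2),
+#       padding=(1, 1, 1),
+#   )
+#   y = conv(torch.randn(2, 16, 4, 32, 32))  # -> (2, 32, 4, 16, 16)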
diff --git a/pytorchvideo/layers/distributed.py b/pytorchvideo/layers/distributed.py
new file mode 100644
index 00000000..4bf37fc9
--- /dev/null
+++ b/pytorchvideo/layers/distributed.py
@@ -0,0 +1,15 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import torch
+
+
+def get_world_size() -> int:
+ """
+    Simple wrapper for correctly getting the world size in both distributed
+    and non-distributed settings.
+ """
+ return (
+ torch.distributed.get_world_size()
+ if torch.distributed.is_available() and torch.distributed.is_initialized()
+ else 1
+ )
diff --git a/pytorchvideo/layers/fusion.py b/pytorchvideo/layers/fusion.py
new file mode 100644
index 00000000..9656bec4
--- /dev/null
+++ b/pytorchvideo/layers/fusion.py
@@ -0,0 +1,149 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Callable, List
+
+import torch
+import torch.nn as nn
+
+
+"""
+Fusion layers are nn.Modules that take a list of Tensors (e.g. from a multi-stream
+architecture), and return a single fused Tensor. This file has several
+different types of fusion layers and a factory function "make_fusion_layer" to
+construct them.
+"""
+
+
+def make_fusion_layer(method: str, feature_dims: List[int]):
+ """
+ Args:
+ method (str): the fusion method to be constructed. Options:
+ - 'concat'
+ - 'temporal_concat'
+ - 'max'
+ - 'sum'
+ - 'prod'
+
+ feature_dims (List[int]): the first argument of all fusion layers. It holds a list
+ of required feature_dims for each tensor input (where the tensor inputs are of
+            shape (batch_size, seq_len, feature_dim)). The list order must correspond to
+ the tensor order passed to forward(...).
+ """
+ if method == "concat":
+ return ConcatFusion(feature_dims)
+ elif method == "temporal_concat":
+ return TemporalConcatFusion(feature_dims)
+ elif method == "max":
+ return ReduceFusion(feature_dims, lambda x: torch.max(x, dim=0).values)
+ elif method == "sum":
+ return ReduceFusion(feature_dims, lambda x: torch.sum(x, dim=0))
+ elif method == "prod":
+ return ReduceFusion(feature_dims, lambda x: torch.prod(x, dim=0))
+ else:
+ raise NotImplementedError(f"Fusion {method} not available.")
+
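+# A minimal, illustrative usage sketch (feature dims and shapes are arbitrary; torch is
+# assumed to be imported in the calling code):
+#
+#   fusion = make_fusion_layer("concat", feature_dims=[256, 128])
+#   a = torch.randn(4, 10, 256)
+#   b = torch.randn(4, 10, 128)
+#   out = fusion([a, b])  # -> (4, 10, 384); fusion.output_dim == 384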
+
+class ConcatFusion(nn.Module):
+ """
+    Concatenates all inputs along their last dimension. The last dimension of the result
+    is the sum of the last dimensions of all input tensors.
+ """
+
+ def __init__(self, feature_dims: List[int]):
+ super().__init__()
+ _verify_feature_dim(feature_dims)
+ self._output_dim = sum(feature_dims)
+
+ @property
+ def output_dim(self):
+ """
+ Last dimension size of forward(..) tensor output.
+ """
+ return self._output_dim
+
+ def forward(self, input_list: List[torch.Tensor]) -> torch.Tensor:
+ """
+ Args:
+ input_list (List[torch.Tensor]): a list of tensors of shape
+ (batch_size, seq_len, feature_dim).
+
+ Returns:
+ Tensor of shape (batch_size, seq_len, sum(feature_dims)) where sum(feature_dims)
+ is the sum of all input feature_dims.
+ """
+ return torch.cat(input_list, dim=-1)
+
+
+class TemporalConcatFusion(nn.Module):
+ """
+ Concatenates all inputs by their temporal dimension which is assumed to be dim=1.
+ """
+
+ def __init__(self, feature_dims: List[int]):
+ super().__init__()
+ _verify_feature_dim(feature_dims)
+
+ # All input dimensions must be the same
+ self._output_dim = max(feature_dims)
+ assert self._output_dim == min(feature_dims)
+
+ @property
+ def output_dim(self):
+ """
+ Last dimension size of forward(..) tensor output.
+ """
+ return self._output_dim
+
+ def forward(self, input_list: List[torch.Tensor]) -> torch.Tensor:
+ """
+ Args:
+ input_list (List[torch.Tensor]): a list of tensors of shape
+ (batch_size, seq_len, feature_dim)
+
+ Returns:
+            Tensor of shape (batch_size, sum(seq_len), feature_dim) where sum(seq_len) is
+            the sum of the seq_len of all input tensors.
+ """
+ return torch.cat(input_list, dim=1)
+
+
+class ReduceFusion(nn.Module):
+ """
+    Generic fusion method that applies a user-provided callable to the stacked input
+    tensors and returns a single fused tensor. This class can be used to implement
+    fusion methods like "sum", "max" and "prod".
+ """
+
+ def __init__(
+ self, feature_dims: List[int], reduce_fn: Callable[[torch.Tensor], torch.Tensor]
+ ):
+ super().__init__()
+ _verify_feature_dim(feature_dims)
+ self.reduce_fn = reduce_fn
+
+ # All input dimensions must be the same
+ self._output_dim = max(feature_dims)
+ assert self._output_dim == min(feature_dims)
+
+ @property
+ def output_dim(self):
+ """
+ Last dimension size of forward(..) tensor output.
+ """
+ return self._output_dim
+
+ def forward(self, input_list: List[torch.Tensor]) -> torch.Tensor:
+ """
+ Args:
+ input_list (List[torch.Tensor]): a list of tensors of shape
+ (batch_size, seq_len, feature_dim).
+
+ Returns:
+ Tensor of shape (batch_size, seq_len, feature_dim).
+ """
+ return self.reduce_fn(torch.stack(input_list))
+
+
+def _verify_feature_dim(feature_dims: List[int]):
+ assert isinstance(feature_dims, list)
+ assert all(x > 0 for x in feature_dims)
diff --git a/pytorchvideo/layers/mlp.py b/pytorchvideo/layers/mlp.py
new file mode 100644
index 00000000..78556e77
--- /dev/null
+++ b/pytorchvideo/layers/mlp.py
@@ -0,0 +1,62 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from typing import Callable, List, Optional, Tuple
+
+from torch import nn
+
+
+def make_multilayer_perceptron(
+ fully_connected_dims: List[int],
+ norm: Optional[Callable] = None,
+ mid_activation: Callable = nn.ReLU,
+ final_activation: Optional[Callable] = nn.ReLU,
+ dropout_rate: float = 0.0,
+) -> Tuple[nn.Module, int]:
+ """
+    Factory function for a Multi-Layer Perceptron. The MLP is constructed as repeated
+    blocks of the following format, where fc[i] denotes the i-th entry of
+    fully_connected_dims (each block's input/output dimension).
+
+ ::
+
+ Linear (in=fc[i-1], out=fc[i])
+ ↓
+ Normalization (norm)
+ ↓
+ Activation (mid_activation)
+ ↓
+ After the repeated Perceptron blocks,
+        After the repeated blocks, a final Linear layer maps to
+        fully_connected_dims[-1], followed by dropout and a final activation:
+ Dropout (p=dropout_rate)
+ ↓
+ Activation (final_activation)
+
+ """
+ assert isinstance(fully_connected_dims, list)
+ assert len(fully_connected_dims) > 1
+ assert all(_is_pos_int(x) for x in fully_connected_dims)
+
+ layers = []
+ cur_dim = fully_connected_dims[0]
+ for dim in fully_connected_dims[1:-1]:
+ layers.append(nn.Linear(cur_dim, dim))
+ if norm is not None:
+ layers.append(norm(dim))
+ layers.append(mid_activation())
+ cur_dim = dim
+ layers.append(nn.Linear(cur_dim, fully_connected_dims[-1]))
+ if dropout_rate > 0:
+ layers.append(nn.Dropout(p=dropout_rate))
+ if final_activation is not None:
+ layers.append(final_activation())
+
+ mlp = nn.Sequential(*layers)
+ output_dim = fully_connected_dims[-1]
+ return mlp, output_dim
+
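+# A minimal, illustrative usage sketch (dimensions are arbitrary; torch is assumed to
+# be imported in the calling code):
+#
+#   mlp, out_dim = make_multilayer_perceptron(
+#       fully_connected_dims=[512, 256, 128],
+#       norm=nn.BatchNorm1d,
+#       dropout_rate=0.5,
+#   )
+#   y = mlp(torch.randn(8, 512))  # -> (8, 128); out_dim == 128
+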
+
+def _is_pos_int(number: int) -> bool:
+ """
+ Returns True if a number is a positive integer.
+ """
+    return type(number) == int and number > 0
diff --git a/pytorchvideo/layers/nonlocal_net.py b/pytorchvideo/layers/nonlocal_net.py
new file mode 100644
index 00000000..aa88e5fb
--- /dev/null
+++ b/pytorchvideo/layers/nonlocal_net.py
@@ -0,0 +1,153 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Callable, Iterable, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from pytorchvideo.layers.utils import set_attributes
+
+
+class NonLocal(nn.Module):
+ """
+ Builds Non-local Neural Networks as a generic family of building
+ blocks for capturing long-range dependencies. Non-local Network
+ computes the response at a position as a weighted sum of the
+ features at all positions. This building block can be plugged into
+ many computer vision architectures.
+ More details in the paper:
+ Wang, Xiaolong, Ross Girshick, Abhinav Gupta, and Kaiming He.
+ "Non-local neural networks."
+ In Proceedings of the IEEE conference on CVPR, 2018.
+ """
+
+ def __init__(
+ self,
+ *,
+ conv_theta: nn.Module,
+ conv_phi: nn.Module,
+ conv_g: nn.Module,
+ conv_out: nn.Module,
+ pool: Optional[nn.Module] = None,
+ norm: Optional[nn.Module] = None,
+ instantiation: str = "dot_product",
+ ) -> None:
+ super().__init__()
+ set_attributes(self, locals())
+ assert None not in (conv_theta, conv_phi, conv_g, conv_out)
+ assert instantiation in (
+ "dot_product",
+ "softmax",
+        ), "Unknown instantiation method {}".format(instantiation)
+ assert (
+ len(
+ {
+ self.conv_theta.out_channels,
+ self.conv_phi.out_channels,
+ self.conv_g.out_channels,
+ self.conv_out.in_channels,
+ }
+ )
+ == 1
+        ), "Nonlocal convolution's input/output dimension mismatch."
+
+ def forward(self, x) -> torch.Tensor:
+ dim_inner = self.conv_theta.out_channels
+
+ x_identity = x
+ N, C, T, H, W = x.size()
+
+ theta = self.conv_theta(x)
+ # Perform temporal-spatial pooling to reduce the computation.
+ if self.pool is not None:
+ x = self.pool(x)
+
+ phi = self.conv_phi(x)
+ g = self.conv_g(x)
+
+ theta = theta.view(N, dim_inner, -1)
+ phi = phi.view(N, dim_inner, -1)
+ g = g.view(N, dim_inner, -1)
+
+ # (N, C, TxHxW) x (N, C, TxHxW) => (N, TxHxW, TxHxW).
+ theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi))
+        # In the original Non-local paper, there are two main ways to normalize
+ # the affinity tensor:
+ # 1) Softmax normalization (norm on exp).
+ # 2) dot_product normalization.
+ if self.instantiation == "softmax":
+ # Normalizing the affinity tensor theta_phi before softmax.
+ theta_phi = theta_phi * (dim_inner ** -0.5)
+ theta_phi = nn.functional.softmax(theta_phi, dim=2)
+ elif self.instantiation == "dot_product":
+ spatial_temporal_dim = theta_phi.shape[2]
+ theta_phi = theta_phi / spatial_temporal_dim
+
+ # (N, TxHxW, TxHxW) * (N, C, TxHxW) => (N, C, TxHxW).
+ theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g))
+ # (N, C, TxHxW) => (N, C, T, H, W).
+ theta_phi_g = theta_phi_g.view(N, dim_inner, T, H, W)
+ p = self.conv_out(theta_phi_g)
+ if self.norm is not None:
+ p = self.norm(p)
+ return x_identity + p
+
+
+def create_nonlocal(
+ *,
+ # Nonlocal configs.
+ dim_in: int,
+ dim_inner: int,
+ pool_size: Optional[Tuple[int]] = (1, 1, 1),
+ instantiation: str = "softmax",
+ # Norm configs.
+ norm: Optional[Callable] = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+):
+ """
+ Builds Non-local Neural Networks as a generic family of building
+ blocks for capturing long-range dependencies. Non-local Network
+ computes the response at a position as a weighted sum of the
+ features at all positions. This building block can be plugged into
+ many computer vision architectures.
+ More details in the paper: https://arxiv.org/pdf/1711.07971
+ Args:
+        dim_in (int): number of input channels.
+        dim_inner (int): number of channels inside the Non-local block.
+        pool_size (tuple[int]): kernel size of the spatial-temporal pooling, given
+            as (temporal kernel size, spatial kernel size, spatial kernel size).
+            By default pool_size is (1, 1, 1); if it is None or all entries are 1,
+            no pooling is used.
+        instantiation (string): supports two instantiation methods:
+            "dot_product": normalizes the affinity matrix by the number of
+            spatial-temporal positions.
+            "softmax": normalizes the affinity matrix with Softmax.
+        norm (callable): a callable that constructs the normalization layer. The
+            default is nn.BatchNorm3d.
+        norm_eps (float): normalization epsilon.
+        norm_momentum (float): normalization momentum.
+ """
+ if pool_size is None:
+ pool_size = (1, 1, 1)
+ assert isinstance(pool_size, Iterable)
+
+ if norm is None:
+ norm_model = None
+ else:
+ norm_model = norm(num_features=dim_in, eps=norm_eps, momentum=norm_momentum)
+
+ if any(size > 1 for size in pool_size):
+ pool_model = nn.MaxPool3d(
+ kernel_size=pool_size, stride=pool_size, padding=[0, 0, 0]
+ )
+ else:
+ pool_model = None
+
+ return NonLocal(
+ conv_theta=nn.Conv3d(dim_in, dim_inner, kernel_size=1, stride=1, padding=0),
+ conv_phi=nn.Conv3d(dim_in, dim_inner, kernel_size=1, stride=1, padding=0),
+ conv_g=nn.Conv3d(dim_in, dim_inner, kernel_size=1, stride=1, padding=0),
+ conv_out=nn.Conv3d(dim_inner, dim_in, kernel_size=1, stride=1, padding=0),
+ pool=pool_model,
+ norm=norm_model,
+ instantiation=instantiation,
+ )
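+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (channel sizes and clip shape are illustrative
+    # assumptions, not a recommended configuration): build a non-local block
+    # with the builder above and run it on a random (N, C, T, H, W) clip.
+    nl_block = create_nonlocal(dim_in=64, dim_inner=32, pool_size=(1, 2, 2))
+    clip = torch.randn(2, 64, 4, 14, 14)
+    out = nl_block(clip)
+    # Non-local blocks are residual, so the input shape is preserved.
+    assert out.shape == clip.shape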
diff --git a/pytorchvideo/layers/positional_encoding.py b/pytorchvideo/layers/positional_encoding.py
new file mode 100644
index 00000000..a574cee3
--- /dev/null
+++ b/pytorchvideo/layers/positional_encoding.py
@@ -0,0 +1,42 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import math
+
+import torch
+from torch import nn
+
+
+class PositionalEncoding(nn.Module):
+ """
+ Applies a positional encoding to a tensor with shape (batch_size x seq_len x embed_dim).
+
+ The positional encoding is computed as follows:
+ PE(pos,2i) = sin(pos/10000^(2i/dmodel))
+ PE(pos,2i+1) = cos(pos/10000^(2i/dmodel))
+
+ where pos = position, pos in [0, seq_len)
+ dmodel = data embedding dimension = embed_dim
+ i = dimension index, i in [0, embed_dim)
+
+ Reference: "Attention Is All You Need" https://arxiv.org/abs/1706.03762
+ Implementation Reference: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
+ """
+
+ def __init__(self, embed_dim: int, seq_len: int = 1024) -> None:
+ super().__init__()
+ pe = torch.zeros(seq_len, embed_dim, dtype=torch.float)
+ position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
+ div_term = torch.exp(
+ torch.arange(0, embed_dim, 2).float() * (-(math.log(10000.0)) / embed_dim)
+ )
+ pe[:, 0::2] = torch.sin(position * div_term)
+ pe[:, 1::2] = torch.cos(position * div_term)
+ pe = pe.unsqueeze(0)
+ self.register_buffer("pe", pe)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ assert self.pe.size(1) >= x.size(1), (
+ "Cannot apply position encoding of size "
+ + f"{self.pe.size()} when input has size {x.size()}"
+ )
+ return x + self.pe[:, : x.size(1), :]
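+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (shapes are illustrative): add positional encodings
+    # to a batch of embedded sequences of shape (batch_size, seq_len, embed_dim).
+    pos_enc = PositionalEncoding(embed_dim=16, seq_len=32)
+    seq = torch.zeros(4, 10, 16)
+    encoded = pos_enc(seq)
+    assert encoded.shape == (4, 10, 16)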
diff --git a/pytorchvideo/layers/squeeze_excitation.py b/pytorchvideo/layers/squeeze_excitation.py
new file mode 100644
index 00000000..47858e0a
--- /dev/null
+++ b/pytorchvideo/layers/squeeze_excitation.py
@@ -0,0 +1,182 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Callable, Optional
+
+import torch
+import torch.nn as nn
+from pytorchvideo.models.resnet import ResBlock
+
+
+class SqueezeAndExcitationLayer2D(nn.Module):
+ """2D Squeeze and excitation layer, as per https://arxiv.org/pdf/1709.01507.pdf"""
+
+ def __init__(
+ self,
+ in_planes: int,
+ reduction_ratio: Optional[int] = 16,
+ reduced_planes: Optional[int] = None,
+ ):
+
+ """
+ Args:
+ in_planes (int): input channel dimension.
+            reduction_ratio (int): factor by which in_planes is reduced to get the
+                bottleneck channel dimension of the excitation branch.
+            reduced_planes (int): bottleneck channel dimension of the excitation
+                branch. Exactly one of reduction_ratio or reduced_planes should be
+                defined.
+ """
+ super().__init__()
+ self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+
+        # Exactly one of reduction_ratio and reduced_planes must be defined.
+        assert bool(reduction_ratio) != bool(
+            reduced_planes
+        ), "Only one of reduction_ratio or reduced_planes should be defined for the SE layer"
+
+ reduced_planes = (
+ in_planes // reduction_ratio if reduced_planes is None else reduced_planes
+ )
+ self.excitation = nn.Sequential(
+ nn.Conv2d(in_planes, reduced_planes, kernel_size=1, stride=1, bias=True),
+ nn.ReLU(),
+ nn.Conv2d(reduced_planes, in_planes, kernel_size=1, stride=1, bias=True),
+ nn.Sigmoid(),
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+            x (tensor): 2D feature map of shape (N, C, H, W).
+ """
+ x_squeezed = self.avgpool(x)
+ x_excited = self.excitation(x_squeezed)
+ x_scaled = x * x_excited
+ return x_scaled
+
+
+def create_audio_2d_squeeze_excitation_block(
+ dim_in: int,
+ dim_out: int,
+ use_se=False,
+ se_reduction_ratio=16,
+ branch_fusion: Callable = lambda x, y: x + y,
+ # Conv configs.
+ conv_a_kernel_size: int = 3,
+ conv_a_stride: int = 1,
+ conv_a_padding: int = 1,
+ conv_b_kernel_size: int = 3,
+ conv_b_stride: int = 1,
+ conv_b_padding: int = 1,
+ # Norm configs.
+ norm: Callable = nn.BatchNorm2d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+) -> nn.Module:
+
+ """
+    2D residual block with squeeze excitation (SE2D). Performs a summation between an
+    identity shortcut in branch1 and a main block in branch2. When the input and
+    output dimensions differ, a convolution followed by a normalization is applied
+    to the shortcut.
+
+ ::
+
+ Input
+ |-------+
+ ↓ |
+ conv2d |
+ ↓ |
+ Norm |
+ ↓ |
+ activation |
+ ↓ |
+ conv2d |
+ ↓ |
+ Norm |
+ ↓ |
+ SE2D |
+       ↓           |
+ Summation ←-+
+ ↓
+ Activation
+
+    Normalization examples include: BatchNorm2d and None (no normalization).
+    Activation examples include: ReLU, Softmax, Sigmoid, and None (no activation).
+
+ Args:
+ dim_in (int): input channel size to the bottleneck block.
+ dim_out (int): output channel size of the bottleneck.
+ use_se (bool): if true, use squeeze excitation layer in the bottleneck.
+ se_reduction_ratio (int): factor by which input channels should be reduced to
+ get the output channel dimension in SE layer.
+ branch_fusion (callable): a callable that constructs summation layer.
+ Examples include: lambda x, y: x + y, OctaveSum.
+
+        conv_a_kernel_size (int): convolutional kernel size for conv_a.
+        conv_a_stride (int): convolutional stride size for conv_a.
+        conv_a_padding (int): convolutional padding for conv_a.
+        conv_b_kernel_size (int): convolutional kernel size for conv_b.
+        conv_b_stride (int): convolutional stride size for conv_b.
+        conv_b_padding (int): convolutional padding for conv_b.
+
+        norm (callable): a callable that constructs normalization layer. Examples
+            include nn.BatchNorm2d, None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+
+ activation (callable): a callable that constructs activation layer in
+ bottleneck and block. Examples include: nn.ReLU, nn.Softmax, nn.Sigmoid,
+ and None (not performing activation).
+
+    Returns:
+        (nn.Module): 2D residual block with squeeze excitation.
+ """
+
+ branch2 = [
+ nn.Conv2d(
+ dim_in,
+ dim_out,
+ kernel_size=conv_a_kernel_size,
+ stride=conv_a_stride,
+ padding=conv_a_padding,
+ bias=False,
+ ),
+ norm(dim_out, norm_eps, norm_momentum),
+ activation() if activation else nn.Identity(),
+ nn.Conv2d(
+ dim_out,
+ dim_out,
+ kernel_size=conv_b_kernel_size,
+ stride=conv_b_stride,
+ padding=conv_b_padding,
+ bias=False,
+ ),
+ norm(dim_out, norm_eps, norm_momentum),
+ ]
+ if use_se:
+ branch2.append(
+ SqueezeAndExcitationLayer2D(dim_out, reduction_ratio=se_reduction_ratio)
+ )
+ branch2 = nn.Sequential(*branch2)
+
+ branch1_conv, branch1_norm = None, None
+ if conv_a_stride * conv_b_stride != 1 or dim_in != dim_out:
+ branch1_conv = nn.Conv2d(
+ dim_in,
+ dim_out,
+ kernel_size=1,
+ stride=conv_a_stride * conv_b_stride,
+ bias=False,
+ )
+ branch1_norm = norm(dim_out, norm_eps, norm_momentum)
+
+ return ResBlock(
+ branch1_conv=branch1_conv,
+ branch1_norm=branch1_norm,
+ branch2=branch2,
+ activation=activation() if activation else None,
+ branch_fusion=branch_fusion,
+ )
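+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (channel sizes and feature-map shape are illustrative
+    # assumptions): a standalone SE layer and an SE residual block applied to a
+    # 2D feature map of shape (N, C, H, W).
+    se = SqueezeAndExcitationLayer2D(in_planes=32, reduction_ratio=16)
+    block = create_audio_2d_squeeze_excitation_block(dim_in=32, dim_out=32, use_se=True)
+    features = torch.randn(2, 32, 28, 28)
+    assert se(features).shape == features.shape
+    assert block(features).shape == features.shape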
diff --git a/pytorchvideo/layers/swish.py b/pytorchvideo/layers/swish.py
new file mode 100644
index 00000000..21bdcece
--- /dev/null
+++ b/pytorchvideo/layers/swish.py
@@ -0,0 +1,34 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import torch
+import torch.nn as nn
+
+
+class Swish(nn.Module):
+ """
+ Wrapper for the Swish activation function.
+ """
+
+ def forward(self, x):
+ return SwishFunction.apply(x)
+
+
+class SwishFunction(torch.autograd.Function):
+ """
+ Implementation of the Swish activation function: x * sigmoid(x).
+
+ Searching for activation functions. Ramachandran, Prajit and Zoph, Barret
+ and Le, Quoc V. 2017
+ """
+
+ @staticmethod
+ def forward(ctx, x):
+ result = x * torch.sigmoid(x)
+ ctx.save_for_backward(x)
+ return result
+
+ @staticmethod
+ def backward(ctx, grad_output):
+        x = ctx.saved_tensors[0]
+ sigmoid_x = torch.sigmoid(x)
+ return grad_output * (sigmoid_x * (1 + x * (1 - sigmoid_x)))
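+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (shapes are illustrative): Swish computes
+    # x * sigmoid(x) and backpropagates through the custom backward above.
+    x = torch.randn(4, 8, requires_grad=True)
+    y = Swish()(x)
+    assert torch.allclose(y, x * torch.sigmoid(x))
+    y.sum().backward()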
diff --git a/pytorchvideo/layers/utils.py b/pytorchvideo/layers/utils.py
new file mode 100644
index 00000000..15593d61
--- /dev/null
+++ b/pytorchvideo/layers/utils.py
@@ -0,0 +1,49 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import math
+from typing import Any, Dict, Optional
+
+
+def set_attributes(self, params: Optional[Dict[str, Any]] = None) -> None:
+    """
+    A utility function used in classes to set attributes from a dictionary of
+    parameters (typically the output of locals()).
+    Args:
+        params (dict): dictionary mapping parameter names to values.
+ """
+ if params:
+ for k, v in params.items():
+ if k != "self":
+ setattr(self, k, v)
+
+
+def round_width(width, multiplier, min_width=8, divisor=8, ceil=False):
+ """
+ Round width of filters based on width multiplier
+ Args:
+ width (int): the channel dimensions of the input.
+ multiplier (float): the multiplication factor.
+ min_width (int): the minimum width after multiplication.
+        divisor (int): the new width should be divisible by divisor.
+ ceil (bool): If True, use ceiling as the rounding method.
+ """
+ if not multiplier:
+ return width
+
+ width *= multiplier
+ min_width = min_width or divisor
+ if ceil:
+ width_out = max(min_width, int(math.ceil(width / divisor)) * divisor)
+ else:
+ width_out = max(min_width, int(width + divisor / 2) // divisor * divisor)
+ if width_out < 0.9 * width:
+ width_out += divisor
+ return int(width_out)
+
+
+def round_repeats(repeats, multiplier):
+ """
+ Round number of layers based on depth multiplier.
+ """
+ if not multiplier:
+ return repeats
+ return int(math.ceil(multiplier * repeats))
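+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (values are illustrative): scale a channel width by a
+    # multiplier while keeping it divisible by 8, and scale a repeat count.
+    assert round_width(64, multiplier=1.5) == 96
+    assert round_width(24, multiplier=2.2, divisor=8) == 56
+    assert round_repeats(3, multiplier=2.2) == 7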
diff --git a/pytorchvideo/models/__init__.py b/pytorchvideo/models/__init__.py
new file mode 100644
index 00000000..9305e3d5
--- /dev/null
+++ b/pytorchvideo/models/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from .csn import create_csn
+from .head import ResNetBasicHead, create_res_basic_head
+from .masked_multistream import (
+ LSTM,
+ LearnMaskedDefault,
+ MaskedMultiPathWay,
+ MaskedSequential,
+ MaskedTemporalPooling,
+ TransposeMultiheadAttention,
+ TransposeTransformerEncoder,
+)
+from .net import MultiPathWayWithFuse, Net
+from .resnet import BottleneckBlock, create_bottleneck_block, create_resnet
+from .slowfast import create_slowfast
+from .stem import ResNetBasicStem, create_res_basic_stem
+from .weight_init import init_net_weights
diff --git a/pytorchvideo/models/accelerator/__init__.py b/pytorchvideo/models/accelerator/__init__.py
new file mode 100644
index 00000000..5c7f19c6
--- /dev/null
+++ b/pytorchvideo/models/accelerator/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/pytorchvideo/models/accelerator/mobile_cpu/__init__.py b/pytorchvideo/models/accelerator/mobile_cpu/__init__.py
new file mode 100644
index 00000000..5c7f19c6
--- /dev/null
+++ b/pytorchvideo/models/accelerator/mobile_cpu/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/pytorchvideo/models/accelerator/mobile_cpu/efficient_x3d.py b/pytorchvideo/models/accelerator/mobile_cpu/efficient_x3d.py
new file mode 100644
index 00000000..dacdb0fb
--- /dev/null
+++ b/pytorchvideo/models/accelerator/mobile_cpu/efficient_x3d.py
@@ -0,0 +1,195 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from collections import OrderedDict
+
+import torch.nn as nn
+from pytorchvideo.layers.accelerator.mobile_cpu.activation_functions import (
+ supported_act_functions,
+)
+from pytorchvideo.layers.accelerator.mobile_cpu.convolutions import (
+ Conv3d5x1x1BnAct,
+ Conv3dPwBnAct,
+ Conv3dTemporalKernel1BnAct,
+)
+from pytorchvideo.layers.accelerator.mobile_cpu.fully_connected import FullyConnected
+from pytorchvideo.layers.accelerator.mobile_cpu.pool import AdaptiveAvgPool3dOutSize1
+
+from .residual_blocks import X3dBottleneckBlock
+
+
+class EfficientX3d(nn.Module):
+ """
+ This class implements an X3D network for classification with efficient blocks.
+ Args:
+ num_classes (int): Number of classes in classification.
+ dropout (float): Dropout rate used for training the network.
+ expansion (str): Expansion for X3D. Possible options: 'XS', 'S', 'M', 'L'.
+ head_act (str): The activation function to be applied in head, should be a key
+ in dict supported_act_functions (see activation_functions.py for more info
+ about supported activations).
+ """
+
+ def __init__(
+ self,
+ num_classes: int = 400,
+ dropout: float = 0.5,
+ expansion: str = "XS",
+ head_act: str = "identity",
+ ):
+ super().__init__()
+ assert expansion in (
+ "XS",
+ "S",
+ "M",
+ "L",
+ ), f"Expansion {expansion} not supported."
+ # s1 - stem
+ s1 = OrderedDict()
+ s1["pathway0_stem_conv_xy"] = Conv3dTemporalKernel1BnAct(
+ 3,
+ 24,
+ bias=False,
+ groups=1,
+ spatial_kernel=3,
+ spatial_stride=2,
+ spatial_padding=1,
+ activation="identity",
+ use_bn=False,
+ )
+ s1["pathway0_stem_conv"] = Conv3d5x1x1BnAct(
+ 24,
+ 24,
+ bias=False,
+ groups=24,
+ use_bn=True,
+ )
+ self.s1 = nn.Sequential(s1)
+ # s2 - res2
+ s2 = OrderedDict()
+ depth_s2 = 5 if expansion == "L" else 3
+ for i_block in range(depth_s2):
+ cur_block = X3dBottleneckBlock(
+ in_channels=24,
+ mid_channels=54,
+ out_channels=24,
+ use_residual=True,
+ spatial_stride=(2 if i_block == 0 else 1),
+ se_ratio=(0.0625 if (i_block % 2) == 0 else 0),
+ act_functions=("relu", "swish", "relu"),
+ use_bn=(True, True, True),
+ )
+ s2[f"pathway0_res{i_block}"] = cur_block
+ self.s2 = nn.Sequential(s2)
+ # s3 - res3
+ s3 = OrderedDict()
+ depth_s3 = 10 if expansion == "L" else 5
+ for i_block in range(depth_s3):
+ cur_block = X3dBottleneckBlock(
+ in_channels=(24 if i_block == 0 else 48),
+ mid_channels=108,
+ out_channels=48,
+ use_residual=True,
+ spatial_stride=(2 if i_block == 0 else 1),
+ se_ratio=(0.0625 if (i_block % 2) == 0 else 0),
+ act_functions=("relu", "swish", "relu"),
+ use_bn=(True, True, True),
+ )
+ s3[f"pathway0_res{i_block}"] = cur_block
+ self.s3 = nn.Sequential(s3)
+ # s4 - res4
+ s4 = OrderedDict()
+ depth_s4 = 25 if expansion == "L" else 11
+ for i_block in range(depth_s4):
+ cur_block = X3dBottleneckBlock(
+ in_channels=(48 if i_block == 0 else 96),
+ mid_channels=216,
+ out_channels=96,
+ use_residual=True,
+ spatial_stride=(2 if i_block == 0 else 1),
+ se_ratio=(0.0625 if (i_block % 2) == 0 else 0),
+ act_functions=("relu", "swish", "relu"),
+ use_bn=(True, True, True),
+ )
+ s4[f"pathway0_res{i_block}"] = cur_block
+ self.s4 = nn.Sequential(s4)
+ # s5 - res5
+ s5 = OrderedDict()
+ depth_s5 = 15 if expansion == "L" else 7
+ for i_block in range(depth_s5):
+ cur_block = X3dBottleneckBlock(
+ in_channels=(96 if i_block == 0 else 192),
+ mid_channels=432,
+ out_channels=192,
+ use_residual=True,
+ spatial_stride=(2 if i_block == 0 else 1),
+ se_ratio=(0.0625 if (i_block % 2) == 0 else 0),
+ act_functions=("relu", "swish", "relu"),
+ use_bn=(True, True, True),
+ )
+ s5[f"pathway0_res{i_block}"] = cur_block
+ self.s5 = nn.Sequential(s5)
+ # head
+ head = OrderedDict()
+ head["conv_5"] = Conv3dPwBnAct(
+ in_channels=192,
+ out_channels=432,
+ bias=False,
+ use_bn=True,
+ )
+ head["avg_pool"] = AdaptiveAvgPool3dOutSize1()
+ head["lin_5"] = Conv3dPwBnAct(
+ in_channels=432,
+ out_channels=2048,
+ bias=False,
+ use_bn=False,
+ )
+ self.head = nn.Sequential(head)
+ if dropout > 0:
+ self.dropout = nn.Dropout(dropout)
+ self.projection = FullyConnected(2048, num_classes, bias=True)
+ assert head_act in supported_act_functions, f"{head_act} is not supported."
+ self.act = supported_act_functions[head_act]()
+
+ def forward(self, x):
+ x = self.s1(x)
+ x = self.s2(x)
+ x = self.s3(x)
+ x = self.s4(x)
+ x = self.s5(x)
+ x = self.head(x)
+ # (N, C, T, H, W) -> (N, T, H, W, C).
+ x = x.permute((0, 2, 3, 4, 1))
+ if hasattr(self, "dropout"):
+ x = self.dropout(x)
+ x = self.projection(x)
+        # Performs fully convolutional inference.
+ if not self.training:
+ x = self.act(x)
+ x = x.mean([1, 2, 3])
+ x = x.view(x.shape[0], -1)
+
+ return x
+
+
+def create_x3d(
+ *,
+ # EfficientX3d model arguments.
+ num_classes: int = 400,
+ dropout: float = 0.5,
+ expansion: str = "XS",
+ head_act: str = "identity",
+):
+ """
+ This function builds a X3D network with efficient blocks.
+ Args:
+ num_classes (int): Number of classes in classification.
+ dropout (float): Dropout rate used for training the network.
+ expansion (str): Expansion for X3D. Possible options: 'XS', 'S', 'M', 'L'.
+ head_act (str): The activation function to be applied in head, should be a key
+ in dict supported_act_functions (see activation_functions.py for more info
+ about supported activations). Currently ReLU ('relu'), Swish ('swish'),
+ Hardswish ('hswish'), Identity ('identity') are supported.
+ """
+ return EfficientX3d(
+ num_classes=num_classes, dropout=dropout, expansion=expansion, head_act=head_act
+ )
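+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (the 4-frame, 160x160 clip shape is an illustrative
+    # assumption commonly paired with the XS expansion):
+    import torch
+
+    model = create_x3d(num_classes=400, expansion="XS")
+    model.eval()
+    clip = torch.randn(1, 3, 4, 160, 160)  # (N, C, T, H, W)
+    with torch.no_grad():
+        logits = model(clip)  # expected shape: (1, 400)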
diff --git a/pytorchvideo/models/accelerator/mobile_cpu/residual_blocks.py b/pytorchvideo/models/accelerator/mobile_cpu/residual_blocks.py
new file mode 100644
index 00000000..052c6864
--- /dev/null
+++ b/pytorchvideo/models/accelerator/mobile_cpu/residual_blocks.py
@@ -0,0 +1,214 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from collections import OrderedDict
+from typing import Optional, Tuple
+
+import torch.nn as nn
+from pytorchvideo.accelerator.efficient_blocks.efficient_block_base import (
+ EfficientBlockBase,
+)
+from pytorchvideo.layers.accelerator.mobile_cpu.activation_functions import (
+ supported_act_functions,
+)
+from pytorchvideo.layers.accelerator.mobile_cpu.attention import SqueezeExcitation
+from pytorchvideo.layers.accelerator.mobile_cpu.convolutions import (
+ Conv3d3x3x3DwBnAct,
+ Conv3dPwBnAct,
+ Conv3dTemporalKernel1BnAct,
+)
+from pytorchvideo.layers.utils import round_width
+
+
+class X3dBottleneckBlock(EfficientBlockBase):
+ """
+    Implements an X3D-style residual block with optional squeeze-excite (SE)
+ using efficient blocks.
+
+ Input +----------------------+
+ | |
+ v |
+ conv3d[0] (1x1x1) |
+ | |
+ v |
+ batchNorm (optional) |
+ | |
+ v |
+ activation[0] |
+ | |
+ v |
+ conv3d[1] (3x3x3 dw) |
+ | |
+ v |
+ batchNorm (optional) |
+ | |
+ v |
+ Squeeze-Excite (optional) |
+ | |
+ v |
+ activation[1] |
+ | |
+ v |
+ conv3d[2] (1x1x1) |
+ | |
+ v |
+ batchNorm (optional) |
+ | |
+ v |
+ sum <-----------------------+
+ |
+ v
+ activation[2]
+
+ Args:
+        in_channels (int): input channels for 1x1x1 conv3d[0].
+ mid_channels (int): channels for 3x3x3 dw conv3d[1].
+ out_channels (int): output channels for 1x1x1 conv3d[2].
+ spatial_stride (int): spatial stride for 3x3x3 dw conv3d[1].
+ se_ratio (float): if > 0, apply SE to the 3x3x3 dw conv3d[1], with the SE
+ channel dimensionality being se_ratio times the 3x3x3 conv dim.
+ bias (tuple of bool): if bias[i] is true, use bias for conv3d[i].
+ act_functions (tuple of str): act_functions[i] is the activation function after
+ conv3d[i]. act_functions[i] should be a key in dict supported_act_functions
+ (see activation_functions.py for more info about supported activations).
+ Currently ReLU ('relu'), Swish ('swish'), Hardswish ('hswish'), Identity
+ ('identity') are supported.
+ use_bn (tuple of bool): if use_bn[i] is true, use batchnorm after conv3d[i].
+ norm_eps (float): epsilon for batchnorm.
+ norm_momentum (float): momentum for batchnorm.
+
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ mid_channels: int,
+ out_channels: int,
+ use_residual: bool = True,
+ spatial_stride: int = 1,
+ se_ratio: float = 0.0625,
+ act_functions: Optional[Tuple[str]] = ("relu", "relu", "relu"),
+ bias: Optional[Tuple[bool]] = (False, False, False),
+ use_bn: Optional[Tuple[bool]] = (True, True, True),
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ ):
+ super().__init__()
+
+ # Residual projection
+ self._use_residual = use_residual
+ self._res_proj = None
+ if self._use_residual:
+ self._residual_add_func = nn.quantized.FloatFunctional()
+ if (spatial_stride != 1) or (in_channels != out_channels):
+ self._res_proj = Conv3dTemporalKernel1BnAct(
+ in_channels,
+ out_channels,
+ bias=False,
+ groups=1,
+ spatial_kernel=1,
+ spatial_stride=spatial_stride,
+ spatial_padding=0,
+ spatial_dilation=1,
+ activation="identity",
+ use_bn=True,
+ )
+
+ layers = OrderedDict()
+
+ # 1x1x1 pointwise layer conv[0]
+ assert (
+ act_functions[0] in supported_act_functions
+ ), f"{act_functions[0]} is not supported."
+ layers["conv_0"] = Conv3dPwBnAct(
+ in_channels,
+ mid_channels,
+ bias=bias[0],
+ # If activation function is relu, just include that in convBnRelu block.
+ activation=act_functions[0],
+ use_bn=use_bn[0],
+ norm_eps=norm_eps,
+ norm_momentum=norm_momentum,
+ )
+
+ # 3x3x3 dw layer conv[1]
+ self._spatial_stride = spatial_stride
+ self._mid_channels = mid_channels
+ assert (
+ act_functions[1] in supported_act_functions
+ ), f"{act_functions[1]} is not supported."
+ layers["conv_1"] = Conv3d3x3x3DwBnAct(
+ mid_channels,
+ spatial_stride=self._spatial_stride,
+ bias=bias[1],
+ activation="identity", # Will apply activation after SE.
+ use_bn=use_bn[1],
+ norm_eps=norm_eps,
+ norm_momentum=norm_momentum,
+ )
+ if se_ratio > 0:
+ layers["se"] = SqueezeExcitation(
+ num_channels=mid_channels,
+ num_channels_reduced=round_width(mid_channels, se_ratio),
+ is_3d=True,
+ )
+ # Add activation function if act_functions[1].
+ layers["act_func_1"] = supported_act_functions[act_functions[1]]()
+
+ # Second 1x1x1 pointwise layer conv[2]
+ self._out_channels = out_channels
+ assert (
+ act_functions[2] in supported_act_functions
+ ), f"{act_functions[2]} is not supported."
+ layers["conv_2"] = Conv3dPwBnAct(
+ mid_channels,
+ out_channels,
+ bias=bias[2],
+ # With residual, apply activation function externally after residual sum.
+ activation="identity",
+ use_bn=use_bn[2],
+ norm_eps=norm_eps,
+ norm_momentum=norm_momentum,
+ )
+ self.final_act = supported_act_functions[act_functions[2]]()
+
+ self.layers = nn.Sequential(layers)
+
+ self.convert_flag = False
+
+ def forward(self, x):
+ out = self.layers(x)
+ if self._use_residual:
+ if self._res_proj is not None:
+ x = self._res_proj(x)
+ out = self._residual_add_func.add(x, out)
+ out = self.final_act(out)
+ return out
+
+ def convert(self, input_blob_size, *args, **kwargs):
+ assert (
+ self.convert_flag is False
+ ), "X3dBottleneckBlock: already converted, cannot be converted twice"
+
+ # Convert self.layers
+ batch_size = input_blob_size[0]
+ THW_size = tuple(input_blob_size[2:])
+ if self._res_proj is not None:
+ self._res_proj.convert(input_blob_size)
+ self.layers.conv_0.convert(input_blob_size)
+ # Update input_blob_size when necessary after each layer
+ input_blob_size = (batch_size, self._mid_channels) + THW_size
+
+ self.layers.conv_1.convert(input_blob_size)
+ THW_size = (
+ THW_size[0],
+ THW_size[1] // self._spatial_stride,
+ THW_size[2] // self._spatial_stride,
+ )
+ input_blob_size = (batch_size, self._mid_channels) + THW_size
+ if hasattr(self.layers, "se"):
+ self.layers.se.convert(input_blob_size)
+ self.layers.act_func_1.convert(input_blob_size)
+ self.layers.conv_2.convert(input_blob_size)
+ input_blob_size = (batch_size, self._out_channels) + THW_size
+ self.final_act.convert(input_blob_size)
+ self.convert_flag = True
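+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (channel sizes and clip shape are illustrative
+    # assumptions): run the block in eager mode, then convert it for efficient
+    # mobile-CPU deployment with a fixed input size.
+    import torch
+
+    block = X3dBottleneckBlock(in_channels=24, mid_channels=54, out_channels=24)
+    block.eval()
+    clip = torch.randn(1, 24, 4, 28, 28)  # (N, C, T, H, W)
+    with torch.no_grad():
+        out = block(clip)  # expected shape: (1, 24, 4, 28, 28)
+    block.convert(input_blob_size=(1, 24, 4, 28, 28))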
diff --git a/pytorchvideo/models/byol.py b/pytorchvideo/models/byol.py
new file mode 100644
index 00000000..5965b284
--- /dev/null
+++ b/pytorchvideo/models/byol.py
@@ -0,0 +1,140 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import copy
+from typing import Callable, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class BYOL(nn.Module):
+ """
+    Bootstrap Your Own Latent: A New Approach to Self-Supervised Learning.
+ Details can be found in:
+ https://arxiv.org/pdf/2006.07733.pdf
+ """
+
+ def __init__(
+ self,
+ backbone: nn.Module,
+ projector: Optional[nn.Module] = None,
+ predictor: Optional[nn.Module] = None,
+ feature_dim: int = 2048,
+ predictor_inner: int = 4096,
+ mmt: float = 0.99,
+ norm: Callable = nn.SyncBatchNorm,
+ ) -> None:
+ """
+ Args:
+ backbone (nn.Module): backbone for byol, input shape depends on the forward
+ input size. Standard inputs include `B x C`, `B x C x H x W`, and
+ `B x C x T x H x W`.
+            projector (nn.Module): a standard projector is an MLP with 2 to 3 hidden
+                layers, with (synchronized) BatchNorm and ReLU activation.
+            predictor (nn.Module): predictor MLP of BYOL with a similar structure to
+                the projector MLP.
+ feature_dim (int): output feature dimension.
+ predictor_inner (int): inner channel size for predictor.
+ mmt (float): momentum update ratio for the momentum backbone.
+ norm (callable): normalization to be used in projector, default is
+ synchronized batchnorm.
+ """
+ super().__init__()
+ self.mmt = mmt
+ self.feature_dim = feature_dim
+ if projector is not None:
+ backbone = nn.Sequential(
+ backbone,
+ projector,
+ )
+ self.backbone = backbone
+ self.backbone_mmt = copy.deepcopy(backbone)
+ for p in self.backbone_mmt.parameters():
+ p.requires_grad = False
+ if predictor is None:
+ self.predictor = nn.Sequential(
+ nn.Linear(feature_dim, predictor_inner, bias=False),
+ norm(predictor_inner),
+ nn.ReLU(inplace=True),
+ nn.Linear(predictor_inner, feature_dim, bias=True),
+ )
+ else:
+ self.predictor = predictor
+
+ def sim_loss(self, q, k):
+ """
+ Similarity loss for byol.
+ Args:
+ q and k (nn.tensor): inputs to calculate the similarity, expected to have
+ the same shape of `N x C`.
+ """
+ similarity = torch.einsum("nc,nc->n", [q, k])
+ loss = -similarity.mean()
+ return loss
+
+ def update_mmt(self, mmt: float):
+ """
+ Update the momentum. This function can be used to perform momentum annealing.
+ Args:
+ mmt (float): update the momentum.
+ """
+ self.mmt = mmt
+
+ def get_mmt(self) -> float:
+ """
+ Get the momentum. This function can be used to perform momentum annealing.
+ """
+ return self.mmt
+
+ @torch.no_grad()
+ def _momentum_update_backbone(self):
+ """
+ Momentum update on the backbone.
+ """
+ for param, param_mmt in zip(
+ self.backbone.parameters(), self.backbone_mmt.parameters()
+ ):
+ param_mmt.data = param_mmt.data * self.mmt + param.data * (1.0 - self.mmt)
+
+ @torch.no_grad()
+ def forward_backbone_mmt(self, x):
+ """
+ Forward momentum backbone.
+ Args:
+ x (tensor): input to be forwarded.
+ """
+ with torch.no_grad():
+ proj = self.backbone_mmt(x)
+ return F.normalize(proj, dim=1)
+
+ def forward_backbone(self, x):
+ """
+ Forward backbone.
+ Args:
+ x (tensor): input to be forwarded.
+ """
+ proj = self.backbone(x)
+ pred = self.predictor(proj)
+ return F.normalize(pred, dim=1)
+
+ def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+            x1 (torch.tensor): a batch of images with one augmentation. The tensor
+                shape should be compatible with the backbone.
+            x2 (torch.tensor): the same batch of images with a different augmentation.
+                The tensor shape should be compatible with the backbone.
+ """
+ pred_1 = self.forward_backbone(x1)
+ pred_2 = self.forward_backbone(x2)
+
+ with torch.no_grad():
+ self._momentum_update_backbone()
+ proj_mmt_1 = self.forward_backbone_mmt(x1)
+ proj_mmt_2 = self.forward_backbone_mmt(x2)
+
+ loss = (
+ self.sim_loss(pred_1, proj_mmt_2) + self.sim_loss(pred_2, proj_mmt_1)
+ ) / 2
+ return loss
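+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (the toy linear backbone and feature size are
+    # illustrative assumptions, not a recommended configuration). nn.BatchNorm1d
+    # replaces the default SyncBatchNorm so the sketch runs without a
+    # distributed process group.
+    backbone = nn.Linear(32, 128)
+    byol = BYOL(backbone, feature_dim=128, predictor_inner=256, norm=nn.BatchNorm1d)
+    view1, view2 = torch.randn(8, 32), torch.randn(8, 32)
+    loss = byol(view1, view2)
+    loss.backward()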
diff --git a/pytorchvideo/models/csn.py b/pytorchvideo/models/csn.py
new file mode 100644
index 00000000..d3c95853
--- /dev/null
+++ b/pytorchvideo/models/csn.py
@@ -0,0 +1,187 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Callable, Tuple
+
+import torch.nn as nn
+from pytorchvideo.models.head import create_res_basic_head
+from pytorchvideo.models.resnet import Net, create_bottleneck_block, create_res_stage
+from pytorchvideo.models.stem import create_res_basic_stem
+
+
+def create_csn(
+ *,
+ # Input clip configs.
+ input_channel: int = 3,
+ # Model configs.
+ model_depth: int = 50,
+ model_num_class: int = 400,
+ dropout_rate: float = 0,
+ # Normalization configs.
+ norm: Callable = nn.BatchNorm3d,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+ # Stem configs.
+ stem_dim_out: int = 64,
+ stem_conv_kernel_size: Tuple[int] = (3, 7, 7),
+ stem_conv_stride: Tuple[int] = (1, 2, 2),
+ stem_pool: Callable = None,
+ stem_pool_kernel_size: Tuple[int] = (1, 3, 3),
+ stem_pool_stride: Tuple[int] = (1, 2, 2),
+ # Stage configs.
+ stage_conv_a_kernel_size: Tuple[int] = (1, 1, 1),
+ stage_conv_b_kernel_size: Tuple[int] = (3, 3, 3),
+ stage_conv_b_width_per_group: int = 1,
+ stage_spatial_stride: Tuple[int] = (1, 2, 2, 2),
+ stage_temporal_stride: Tuple[int] = (1, 2, 2, 2),
+ bottleneck: Callable = create_bottleneck_block,
+ bottleneck_ratio: int = 4,
+ # Head configs.
+ head_pool: Callable = nn.AvgPool3d,
+ head_pool_kernel_size: Tuple[int] = (1, 7, 7),
+ head_output_size: Tuple[int] = (1, 1, 1),
+ head_activation: Callable = None,
+ head_output_with_global_average: bool = True,
+) -> nn.Module:
+ """
+ Build Channel-Separated Convolutional Networks (CSN):
+ Video classification with channel-separated convolutional networks.
+ Du Tran, Heng Wang, Lorenzo Torresani, Matt Feiszli. ICCV 2019.
+
+ CSN follows the ResNet style architecture including three parts: Stem,
+ Stages and Head. The three parts are assembled in the following order:
+
+ ::
+
+ Input
+ ↓
+ Stem
+ ↓
+ Stage 1
+ ↓
+ .
+ .
+ .
+ ↓
+ Stage N
+ ↓
+ Head
+
+ CSN uses depthwise convolution. To further reduce the computational cost, it uses
+ low resolution (112x112), short clips (4 frames), different striding and kernel
+ size, etc.
+
+ Args:
+
+ input_channel (int): number of channels for the input video clip.
+
+ model_depth (int): the depth of the resnet. Options include: 50, 101, 152.
+ model_num_class (int): the number of classes for the video dataset.
+ dropout_rate (float): dropout rate.
+
+ norm (callable): a callable that constructs normalization layer.
+
+ activation (callable): a callable that constructs activation layer.
+
+ stem_dim_out (int): output channel size to stem.
+ stem_conv_kernel_size (tuple): convolutional kernel size(s) of stem.
+ stem_conv_stride (tuple): convolutional stride size(s) of stem.
+ stem_pool (callable): a callable that constructs resnet head pooling layer.
+ stem_pool_kernel_size (tuple): pooling kernel size(s).
+ stem_pool_stride (tuple): pooling stride size(s).
+
+ stage_conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
+ stage_conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
+        stage_conv_b_width_per_group (int): the width of each group for conv_b. Set
+ it to 1 for depthwise convolution.
+ stage_spatial_stride (tuple): the spatial stride for each stage.
+ stage_temporal_stride (tuple): the temporal stride for each stage.
+ bottleneck (callable): a callable that constructs bottleneck block layer.
+ Examples include: create_bottleneck_block.
+ bottleneck_ratio (int): the ratio between inner and outer dimensions for
+ the bottleneck block.
+
+ head_pool (callable): a callable that constructs resnet head pooling layer.
+ head_pool_kernel_size (tuple): the pooling kernel size.
+ head_output_size (tuple): the size of output tensor for head.
+ head_activation (callable): a callable that constructs activation layer.
+ head_output_with_global_average (bool): if True, perform global averaging on
+ the head output.
+
+ Returns:
+ (nn.Module): the csn model.
+ """
+ # Number of blocks for different stages given the model depth.
+ _MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3), 101: (3, 4, 23, 3), 152: (3, 8, 36, 3)}
+
+ # Given a model depth, get the number of blocks for each stage.
+ assert (
+ model_depth in _MODEL_STAGE_DEPTH.keys()
+ ), f"{model_depth} is not in {_MODEL_STAGE_DEPTH.keys()}"
+ stage_depths = _MODEL_STAGE_DEPTH[model_depth]
+
+ blocks = []
+ # Create stem for CSN.
+ stem = create_res_basic_stem(
+ in_channels=input_channel,
+ out_channels=stem_dim_out,
+ conv_kernel_size=stem_conv_kernel_size,
+ conv_stride=stem_conv_stride,
+ conv_padding=[size // 2 for size in stem_conv_kernel_size],
+ pool=stem_pool,
+ pool_kernel_size=stem_pool_kernel_size,
+ pool_stride=stem_pool_stride,
+ pool_padding=[size // 2 for size in stem_pool_kernel_size],
+ norm=norm,
+ activation=activation,
+ )
+ blocks.append(stem)
+
+ stage_dim_in = stem_dim_out
+ stage_dim_out = stage_dim_in * 4
+
+ # Create each stage for CSN.
+ for idx in range(len(stage_depths)):
+ stage_dim_inner = stage_dim_out // bottleneck_ratio
+ depth = stage_depths[idx]
+
+ stage_conv_b_stride = (
+ stage_temporal_stride[idx],
+ stage_spatial_stride[idx],
+ stage_spatial_stride[idx],
+ )
+
+ stage = create_res_stage(
+ depth=depth,
+ dim_in=stage_dim_in,
+ dim_inner=stage_dim_inner,
+ dim_out=stage_dim_out,
+ bottleneck=bottleneck,
+ conv_a_kernel_size=stage_conv_a_kernel_size,
+ conv_a_stride=(1, 1, 1),
+ conv_a_padding=[size // 2 for size in stage_conv_a_kernel_size],
+ conv_b_kernel_size=stage_conv_b_kernel_size,
+ conv_b_stride=stage_conv_b_stride,
+ conv_b_padding=[size // 2 for size in stage_conv_b_kernel_size],
+ conv_b_num_groups=(stage_dim_inner // stage_conv_b_width_per_group),
+ conv_b_dilation=(1, 1, 1),
+ norm=norm,
+ activation=activation,
+ )
+
+ blocks.append(stage)
+ stage_dim_in = stage_dim_out
+ stage_dim_out = stage_dim_out * 2
+
+ # Create head for CSN.
+ head = create_res_basic_head(
+ in_features=stage_dim_in,
+ out_features=model_num_class,
+ pool=head_pool,
+ output_size=head_output_size,
+ pool_kernel_size=head_pool_kernel_size,
+ dropout_rate=dropout_rate,
+ activation=head_activation,
+ output_with_global_average=head_output_with_global_average,
+ )
+ blocks.append(head)
+ return Net(blocks=nn.ModuleList(blocks))
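+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (the short, low-resolution clip follows the setting
+    # mentioned in the docstring; the batch size is illustrative):
+    import torch
+
+    model = create_csn(model_depth=50, model_num_class=400)
+    model.eval()
+    clip = torch.randn(2, 3, 4, 112, 112)  # (N, C, T, H, W)
+    with torch.no_grad():
+        preds = model(clip)  # expected shape: (2, 400)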
diff --git a/pytorchvideo/models/head.py b/pytorchvideo/models/head.py
new file mode 100644
index 00000000..873ce7f4
--- /dev/null
+++ b/pytorchvideo/models/head.py
@@ -0,0 +1,164 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Callable, Tuple
+
+import torch
+import torch.nn as nn
+from pytorchvideo.layers.utils import set_attributes
+
+
+def create_res_basic_head(
+ *,
+ # Projection configs.
+ in_features: int,
+ out_features: int,
+ # Pooling configs.
+ pool: Callable = nn.AvgPool3d,
+ output_size: Tuple[int] = (1, 1, 1),
+ pool_kernel_size: Tuple[int] = (1, 7, 7),
+ pool_stride: Tuple[int] = (1, 1, 1),
+ pool_padding: Tuple[int] = (0, 0, 0),
+ # Dropout configs.
+ dropout_rate: float = 0.5,
+ # Activation configs.
+ activation: Callable = None,
+ # Output configs.
+ output_with_global_average: bool = True,
+) -> nn.Module:
+ """
+ Creates ResNet basic head. This layer performs an optional pooling operation
+ followed by an optional dropout, a fully-connected projection, an activation layer
+ and a global spatiotemporal averaging.
+
+ ::
+
+
+ Pooling
+ ↓
+ Dropout
+ ↓
+ Projection
+ ↓
+ Activation
+ ↓
+ Averaging
+
+ Activation examples include: ReLU, Softmax, Sigmoid, and None.
+ Pool3d examples include: AvgPool3d, MaxPool3d, AdaptiveAvgPool3d, and None.
+
+ Args:
+
+ in_features: input channel size of the resnet head.
+ out_features: output channel size of the resnet head.
+
+ pool (callable): a callable that constructs resnet head pooling layer,
+ examples include: nn.AvgPool3d, nn.MaxPool3d, nn.AdaptiveAvgPool3d, and
+ None (not applying pooling).
+ pool_kernel_size (tuple): pooling kernel size(s) when not using adaptive
+ pooling.
+ pool_stride (tuple): pooling stride size(s) when not using adaptive pooling.
+ pool_padding (tuple): pooling padding size(s) when not using adaptive
+ pooling.
+ output_size (tuple): spatial temporal output size when using adaptive
+ pooling.
+
+ activation (callable): a callable that constructs resnet head activation
+ layer, examples include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not
+ applying activation).
+
+ dropout_rate (float): dropout rate.
+
+ output_with_global_average (bool): if True, perform global averaging on temporal
+ and spatial dimensions and reshape output to batch_size x out_features.
+ """
+ if activation is None:
+ activation_model = None
+ elif activation == nn.Softmax:
+ activation_model = activation(dim=1)
+ else:
+ activation_model = activation()
+
+ if pool is None:
+ pool_model = None
+ elif pool == nn.AdaptiveAvgPool3d:
+ pool_model = pool(output_size)
+ else:
+ pool_model = pool(
+ kernel_size=pool_kernel_size, stride=pool_stride, padding=pool_padding
+ )
+
+ if output_with_global_average:
+ output_pool = nn.AdaptiveAvgPool3d(1)
+ else:
+ output_pool = None
+
+ return ResNetBasicHead(
+ proj=nn.Linear(in_features, out_features),
+ activation=activation_model,
+ pool=pool_model,
+ dropout=nn.Dropout(dropout_rate) if dropout_rate > 0 else None,
+ output_pool=output_pool,
+ )
+
+
+class ResNetBasicHead(nn.Module):
+ """
+ ResNet basic head. This layer performs an optional pooling operation followed by an
+ optional dropout, a fully-connected projection, an optional activation layer and a
+ global spatiotemporal averaging.
+
+ ::
+
+ Pool3d
+ ↓
+ Dropout
+ ↓
+ Projection
+ ↓
+ Activation
+ ↓
+ Averaging
+
+ The builder can be found in `create_res_basic_head`.
+ """
+
+ def __init__(
+ self,
+ pool: nn.Module = None,
+ dropout: nn.Module = None,
+ proj: nn.Module = None,
+ activation: nn.Module = None,
+ output_pool: nn.Module = None,
+ ) -> None:
+ """
+ Args:
+ pool (torch.nn.modules): pooling module.
+ dropout(torch.nn.modules): dropout module.
+            proj (torch.nn.modules): projection module.
+ activation (torch.nn.modules): activation module.
+ output_pool (torch.nn.Module): pooling module for output.
+ """
+ super().__init__()
+ set_attributes(self, locals())
+ assert self.proj is not None
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ # Performs pooling.
+ if self.pool is not None:
+ x = self.pool(x)
+ # Performs dropout.
+ if self.dropout is not None:
+ x = self.dropout(x)
+ # Performs projection.
+ x = x.permute((0, 2, 3, 4, 1))
+ x = self.proj(x)
+ x = x.permute((0, 4, 1, 2, 3))
+ # Performs activation.
+ if self.activation is not None:
+ x = self.activation(x)
+
+ if self.output_pool is not None:
+ # Performs global averaging.
+ x = self.output_pool(x)
+ x = x.view(x.shape[0], -1)
+ return x
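+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (the feature-map shape is an illustrative assumption):
+    # pool a 5D feature map, project it to class logits, and average globally.
+    head = create_res_basic_head(in_features=2048, out_features=400)
+    features = torch.randn(2, 2048, 4, 7, 7)  # (N, C, T, H, W)
+    logits = head(features)
+    assert logits.shape == (2, 400)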
diff --git a/pytorchvideo/models/hub/__init__.py b/pytorchvideo/models/hub/__init__.py
new file mode 100644
index 00000000..949d0acd
--- /dev/null
+++ b/pytorchvideo/models/hub/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from .efficient_x3d_mobile_cpu import efficient_x3d_xs, efficient_x3d_s
+from .resnet import slow_r50
+from .slowfast import slowfast_r50, slowfast_r101
+from .x3d import x3d_m, x3d_s, x3d_xs
diff --git a/pytorchvideo/models/hub/efficient_x3d_mobile_cpu.py b/pytorchvideo/models/hub/efficient_x3d_mobile_cpu.py
new file mode 100644
index 00000000..1af4048a
--- /dev/null
+++ b/pytorchvideo/models/hub/efficient_x3d_mobile_cpu.py
@@ -0,0 +1,80 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Any
+
+import torch.nn as nn
+from pytorchvideo.models.accelerator.mobile_cpu.efficient_x3d import create_x3d
+from torch.hub import load_state_dict_from_url
+
+_root_dir = "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics"
+_checkpoint_paths = {
+ "efficient_x3d_xs": f"{_root_dir}/efficient_x3d_xs_original_form.pyth",
+ "efficient_x3d_s": f"{_root_dir}/efficient_x3d_s_original_form.pyth",
+}
+
+
+def _efficient_x3d(
+ pretrained: bool = False,
+ progress: bool = True,
+ checkpoint_path: str = None,
+ # Model params
+ expansion: str = "XS",
+ **kwargs: Any,
+) -> nn.Module:
+
+ model = create_x3d(
+ expansion=expansion,
+ **kwargs,
+ )
+
+ if pretrained and checkpoint_path is not None:
+ state_dict = load_state_dict_from_url(checkpoint_path, progress=progress)
+ model.load_state_dict(state_dict, strict=True)
+
+ return model
+
+
+def efficient_x3d_xs(pretrained: bool = False, progress: bool = True, **kwargs):
+ r"""
+    X3D-XS model architecture [1] with pretrained weights trained
+    on the Kinetics dataset, using an efficient implementation for mobile CPU.
+
+ [1] Christoph Feichtenhofer, "X3D: Expanding Architectures for
+ Efficient Video Recognition." https://arxiv.org/abs/2004.04730
+
+ Args:
+      pretrained (bool): If True, returns a model pre-trained on the Kinetics-400 dataset
+ progress (bool): If True, displays a progress bar of the download to stderr
+ To modify any other model settings, specify them in the kwargs.
+        All the args are defined in pytorchvideo/models/accelerator/mobile_cpu/efficient_x3d.py
+ """
+ return _efficient_x3d(
+ pretrained=pretrained,
+ progress=progress,
+ checkpoint_path=_checkpoint_paths["efficient_x3d_xs"],
+ expansion="XS",
+ **kwargs,
+ )
+
+
+def efficient_x3d_s(pretrained: bool = False, progress: bool = True, **kwargs):
+ r"""
+    X3D-S model architecture [1] with pretrained weights trained
+    on the Kinetics dataset, using an efficient implementation for mobile CPU.
+
+ [1] Christoph Feichtenhofer, "X3D: Expanding Architectures for
+ Efficient Video Recognition." https://arxiv.org/abs/2004.04730
+
+ Args:
+      pretrained (bool): If True, returns a model pre-trained on the Kinetics-400 dataset
+ progress (bool): If True, displays a progress bar of the download to stderr
+ To modify any other model settings, specify them in the kwargs.
+ All the args are defined in pytorchvideo/models/x3d.py
+ """
+ return _efficient_x3d(
+ pretrained=pretrained,
+ progress=progress,
+ checkpoint_path=_checkpoint_paths["efficient_x3d_s"],
+ expansion="S",
+ **kwargs,
+ )
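+
+
+if __name__ == "__main__":
+    # Minimal usage sketch: build the XS variant without downloading weights.
+    # Pass pretrained=True to fetch the Kinetics-400 checkpoint instead.
+    model = efficient_x3d_xs(pretrained=False, num_classes=400)
+    model.eval()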
diff --git a/pytorchvideo/models/hub/resnet.py b/pytorchvideo/models/hub/resnet.py
new file mode 100644
index 00000000..4e724537
--- /dev/null
+++ b/pytorchvideo/models/hub/resnet.py
@@ -0,0 +1,52 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Any
+
+import torch.nn as nn
+from pytorchvideo.models.resnet import create_resnet
+from torch.hub import load_state_dict_from_url
+
+
+"""
+ResNet style models for video recognition.
+"""
+
+checkpoint_paths = {
+ "slow_r50": "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/SLOW_8x8_R50.pyth"
+}
+
+
+def slow_r50(
+ pretrained: bool = False, progress: bool = True, **kwargs: Any
+) -> nn.Module:
+ r"""
+ Slow R50 model architecture [1] with pretrained weights based on 8x8 setting
+ on the Kinetics dataset. Model with pretrained weights has top1 accuracy of 74.58.
+
+ [1] Christoph Feichtenhofer et al, "SlowFast Networks for Video Recognition"
+ https://arxiv.org/pdf/1812.03982.pdf
+
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on the Kinetics dataset
+ progress (bool): If True, displays a progress bar of the download to stderr
+ kwargs: use these to modify any of the other model settings. All the
+ options are defined in pytorchvideo/models/resnet.py
+
+ NOTE: to use the pretrained model, do not modify the model configuration
+ via the kwargs. Only modify settings via kwargs to initialize a new model
+ without pretrained weights.
+ """
+ model = create_resnet(
+ stem_conv_kernel_size=(1, 7, 7),
+ head_pool_kernel_size=(8, 7, 7),
+ model_depth=50,
+ **kwargs,
+ )
+
+ if pretrained:
+ path = checkpoint_paths["slow_r50"]
+ checkpoint = load_state_dict_from_url(path, progress=progress)
+ state_dict = checkpoint["model_state"]
+ model.load_state_dict(state_dict)
+
+ return model
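+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (the 8-frame, 224x224 clip shape is an illustrative
+    # assumption based on the 8x8 setting referenced above):
+    import torch
+
+    model = slow_r50(pretrained=False)
+    model.eval()
+    clip = torch.randn(1, 3, 8, 224, 224)  # (N, C, T, H, W)
+    with torch.no_grad():
+        preds = model(clip)  # expected shape: (1, 400)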
diff --git a/pytorchvideo/models/hub/slowfast.py b/pytorchvideo/models/hub/slowfast.py
new file mode 100644
index 00000000..96d76bea
--- /dev/null
+++ b/pytorchvideo/models/hub/slowfast.py
@@ -0,0 +1,92 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Any
+
+import torch.nn as nn
+from pytorchvideo.models.slowfast import create_slowfast
+from torch.hub import load_state_dict_from_url
+
+
+root_dir = "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics"
+checkpoint_paths = {
+ "slowfast_r50": f"{root_dir}/SLOWFAST_8x8_R50.pyth",
+ "slowfast_r101": f"{root_dir}/SLOWFAST_8x8_R101.pyth",
+}
+
+
+def _slowfast(
+ pretrained: bool = False,
+ progress: bool = True,
+ checkpoint_path: str = "",
+ **kwargs: Any,
+) -> nn.Module:
+ model = create_slowfast(**kwargs)
+ if pretrained:
+ checkpoint = load_state_dict_from_url(checkpoint_path, progress=progress)
+ state_dict = checkpoint["model_state"]
+ model.load_state_dict(state_dict)
+ return model
+
+
+def slowfast_r50(
+ pretrained: bool = False,
+ progress: bool = True,
+ **kwargs: Any,
+) -> nn.Module:
+ r"""
+ SlowFast R50 model architecture [1] trained with an 8x8 setting on the
+ Kinetics dataset. Model with pretrained weights has top1 accuracy of 76.4.
+
+ [1] Christoph Feichtenhofer et al, "SlowFast Networks for Video Recognition"
+ https://arxiv.org/pdf/1812.03982.pdf
+
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on Kinetics dataset
+ progress (bool): If True, displays a progress bar of the download to stderr
+ kwargs: use these to modify any of the other model settings. All the
+ options are defined in pytorchvideo/models/slowfast.py
+
+ NOTE: to use the pretrained model, do not modify the model configuration
+ via the kwargs. Only modify settings via kwargs to initialize a new model
+ without pretrained weights.
+ """
+ return _slowfast(
+ pretrained=pretrained,
+ progress=progress,
+ checkpoint_path=checkpoint_paths["slowfast_r50"],
+ model_depth=50,
+ slowfast_fusion_conv_kernel_size=(7, 1, 1),
+ **kwargs,
+ )
+
+
+def slowfast_r101(
+ pretrained: bool = False,
+ progress: bool = True,
+ **kwargs: Any,
+) -> nn.Module:
+ r"""
+ SlowFast R101 model architecture [1] trained with an 8x8 setting on the
+ Kinetics dataset. Model with pretrained weights has top1 accuracy of 77.9.
+
+ [1] Christoph Feichtenhofer et al, "SlowFast Networks for Video Recognition"
+ https://arxiv.org/pdf/1812.03982.pdf
+
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on Kinetics dataset
+ progress (bool): If True, displays a progress bar of the download to stderr
+ kwargs: use these to modify any of the other model settings. All the
+ options are defined in pytorchvideo/models/slowfast.py
+
+ NOTE: to use the pretrained model, do not modify the model configuration
+ via the kwargs. Only modify settings via kwargs to initialize a new model
+ without pretrained weights.
+ """
+ return _slowfast(
+ pretrained=pretrained,
+ progress=progress,
+ checkpoint_path=checkpoint_paths["slowfast_r101"],
+ model_depth=101,
+ slowfast_fusion_conv_kernel_size=(5, 1, 1),
+ **kwargs,
+ )
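+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (the two-pathway frame counts of 8 and 32 and the
+    # 224x224 crop are illustrative assumptions matching the 8x8 setting):
+    import torch
+
+    model = slowfast_r50(pretrained=False)
+    model.eval()
+    slow = torch.randn(1, 3, 8, 224, 224)
+    fast = torch.randn(1, 3, 32, 224, 224)
+    with torch.no_grad():
+        preds = model([slow, fast])  # expected shape: (1, 400)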
diff --git a/pytorchvideo/models/hub/x3d.py b/pytorchvideo/models/hub/x3d.py
new file mode 100644
index 00000000..15ec88ce
--- /dev/null
+++ b/pytorchvideo/models/hub/x3d.py
@@ -0,0 +1,125 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Any
+
+import torch.nn as nn
+from pytorchvideo.models.x3d import create_x3d
+from torch.hub import load_state_dict_from_url
+
+
+root_dir = "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics"
+checkpoint_paths = {
+ "x3d_xs": f"{root_dir}/X3D_XS.pyth",
+ "x3d_s": f"{root_dir}/X3D_S.pyth",
+ "x3d_m": f"{root_dir}/X3D_M.pyth",
+}
+
+
+def _x3d(
+ pretrained: bool = False,
+ progress: bool = True,
+ checkpoint_path: str = "",
+ **kwargs: Any,
+) -> nn.Module:
+ model = create_x3d(**kwargs)
+ if pretrained and len(checkpoint_path) > 0:
+ checkpoint = load_state_dict_from_url(checkpoint_path, progress=progress)
+ state_dict = checkpoint["model_state"]
+ model.load_state_dict(state_dict, strict=True)
+ return model
+
+
+def x3d_xs(
+ pretrained: bool = False,
+ progress: bool = True,
+ **kwargs,
+):
+ r"""
+ X3D-XS model architecture [1] trained on the Kinetics dataset.
+ Model with pretrained weights has top1 accuracy of 69.12.
+
+ [1] Christoph Feichtenhofer, "X3D: Expanding Architectures for
+ Efficient Video Recognition." https://arxiv.org/abs/2004.04730
+
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on the Kinetics dataset
+ progress (bool): If True, displays a progress bar of the download to stderr
+ kwargs: use these to modify any of the other model settings. All the
+ options are defined in pytorchvideo/models/x3d.py
+
+ NOTE: to use the pretrained model, do not modify the model configuration
+ via the kwargs. Only modify settings via kwargs to initialize a new model
+ without pretrained weights.
+ """
+ return _x3d(
+ pretrained=pretrained,
+ progress=progress,
+ checkpoint_path=checkpoint_paths["x3d_xs"],
+ input_clip_length=4,
+ input_crop_size=160,
+ **kwargs,
+ )
+
+
+def x3d_s(
+ pretrained: bool = False,
+ progress: bool = True,
+ **kwargs,
+):
+ """
+    X3D-S model architecture [1] trained on the Kinetics dataset.
+ Model with pretrained weights has top1 accuracy of 73.33.
+
+ [1] Christoph Feichtenhofer, "X3D: Expanding Architectures for
+ Efficient Video Recognition." https://arxiv.org/abs/2004.04730
+
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on the Kinetics dataset
+ progress (bool): If True, displays a progress bar of the download to stderr
+ kwargs: use these to modify any of the other model settings. All the
+ options are defined in pytorchvideo/models/x3d.py
+
+ NOTE: to use the pretrained model, do not modify the model configuration
+ via the kwargs. Only modify settings via kwargs to initialize a new model
+ without pretrained weights.
+ """
+ return _x3d(
+ pretrained=pretrained,
+ progress=progress,
+ checkpoint_path=checkpoint_paths["x3d_s"],
+ input_clip_length=13,
+ input_crop_size=160,
+ **kwargs,
+ )
+
+
+def x3d_m(
+ pretrained: bool = False,
+ progress: bool = True,
+ **kwargs,
+):
+ """
+    X3D-M model architecture [1] trained on the Kinetics dataset.
+ Model with pretrained weights has top1 accuracy of 75.94.
+
+ [1] Christoph Feichtenhofer, "X3D: Expanding Architectures for
+ Efficient Video Recognition." https://arxiv.org/abs/2004.04730
+
+ Args:
+ pretrained (bool): If True, returns a model pre-trained on the Kinetics dataset
+ progress (bool): If True, displays a progress bar of the download to stderr
+ kwargs: use these to modify any of the other model settings. All the
+ options are defined in pytorchvideo/models/x3d.py
+
+ NOTE: to use the pretrained model, do not modify the model configuration
+ via the kwargs. Only modify settings via kwargs to initialize a new model
+ without pretrained weights.
+ """
+ return _x3d(
+ pretrained=pretrained,
+ progress=progress,
+ checkpoint_path=checkpoint_paths["x3d_m"],
+ input_clip_length=16,
+ input_crop_size=224,
+ **kwargs,
+ )
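+
+
+if __name__ == "__main__":
+    # Minimal usage sketch: the clip length and crop size mirror the per-variant
+    # values passed above (4 frames at 160x160 for the XS variant).
+    import torch
+
+    model = x3d_xs(pretrained=False)
+    model.eval()
+    clip = torch.randn(1, 3, 4, 160, 160)  # (N, C, T, H, W)
+    with torch.no_grad():
+        preds = model(clip)  # expected shape: (1, 400)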
diff --git a/pytorchvideo/models/masked_multistream.py b/pytorchvideo/models/masked_multistream.py
new file mode 100644
index 00000000..096d4b05
--- /dev/null
+++ b/pytorchvideo/models/masked_multistream.py
@@ -0,0 +1,384 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from typing import List, Optional, Tuple
+
+import torch
+from pytorchvideo.layers.utils import set_attributes
+from torch import nn
+from torch.nn.utils.rnn import pack_padded_sequence
+
+
+"""
+This file contains nn.Modules that take a tensor and mask in their forward function.
+These masks can be used to represent invalid values (e.g. for tensors with varying
+temporal dimension size). To easily compose these modules together, a
+MaskedSequential module is provided.
+
+Example usage:
+
+ feature_dim = 64
+ input_stream = MaskedSequential(
+ PositionalEncoding(feature_dim),
+ Dropout(p=0.1),
+ TransposeMultiheadAttention(feature_dim),
+ MaskedTemporalPooling(feature_dim, method="avg"),
+ LayerNorm(feature_dim),
+ LearnMaskedDefault(feature_dim),
+ )
+
+ input_tensor = ... # tensor with shape (batch_size, seq_len, feature_dim)
+ mask_tensor = ... # bool tensor with shape (batch_size, seq_len)
+ result = input_stream(input=input_tensor, mask=mask_tensor)
+"""
+
+
+class MaskedTemporalPooling(torch.nn.Module):
+ """
+ Applies temporal pooling operations on masked inputs. For each pooling operation
+ all masked values are ignored.
+ """
+
+ def __init__(self, method: str):
+ """
+ method (str): the method of pooling to use. Options:
+ 'max': reduces temporal dimension to each valid max value.
+ 'avg': averages valid values in the temporal dimension.
+ 'sum': sums valid values in the temporal dimension.
+ Note if all batch row elements are invalid, the temporal dimension is
+ pooled to 0 values.
+ """
+ super().__init__()
+ assert method in ("max", "avg", "sum")
+ self._method = method
+
+ def forward(
+ self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
+ ) -> torch.Tensor:
+ """
+ Args:
+ x (torch.Tensor): tensor with shape (batch_size, seq_len, feature_dim)
+ mask (torch.Tensor): bool tensor with shape (batch_size, seq_len).
+ Sequence elements that are False are invalid.
+
+ Returns:
+ Tensor with shape (batch_size, feature_dim)
+ """
+ assert x.dim() == 3, "Requires x shape (batch_size x seq_len x feature_dim)"
+ b, t = x.shape[0], x.shape[1]
+ if mask is None:
+            mask = torch.ones((b, t), dtype=torch.bool, device=x.device)
+
+ if self._method == "max":
+ x[~mask, :] = float("-inf")
+
+ # Invalid batch rows are set to 0.
+ invalid_first_dim = ~mask.view(b, -1).any(dim=-1)
+ x[invalid_first_dim, :] = 0
+
+ x = torch.max(x, dim=1)[0]
+ elif self._method == "avg":
+ x = x * mask.unsqueeze(-1).float()
+ mask = mask.view(b, t, -1).any(dim=-1)
+ valid_lengths = mask.float().sum(dim=-1).int()
+ x = x.sum(dim=1)
+ x = x.div(valid_lengths.clamp(min=1).unsqueeze(-1).expand(x.size()).float())
+ elif self._method == "sum": # sum
+ x = x * mask.unsqueeze(-1).float()
+ x = x.sum(dim=1)
+ else:
+ raise NotImplementedError(
+ f"{self._method} not available options are: 'max', 'avg', 'sum'"
+ )
+
+ return x
+
+
+class TransposeMultiheadAttention(nn.Module):
+ """
+ Wrapper for nn.MultiheadAttention which first transposes the input tensor
+ from (batch_size, seq_len, feature_dim) to (seq_length, batch_size, feature_dim),
+ then applies the attention and transposes the attention outputs back to the input
+ shape.
+ """
+
+ def __init__(self, feature_dim: int, num_heads: int = 1):
+ """
+ Args:
+ feature_dim (int): attention embedding dimension
+ num_heads (int): number of attention heads
+ """
+ super().__init__()
+ self._attention = nn.MultiheadAttention(
+ embed_dim=feature_dim, num_heads=num_heads
+ )
+ self._attention_weights = None
+
+ @property
+ def attention_weights(self) -> Optional[torch.Tensor]:
+ """
+ Contains attention weights from last forward call.
+ """
+ return self._attention_weights
+
+ def forward(
+ self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
+ ) -> torch.Tensor:
+ """
+ Args:
+ x (torch.Tensor): tensor of shape (batch_size, seq_len, feature_dim)
+ mask (torch.Tensor): bool tensor with shape (batch_size, seq_len).
+ Sequence elements that are False are invalid.
+
+ Returns:
+ Tensor with shape (batch_size, seq_len, feature_dim)
+ """
+ assert x.dim() == 3, "Requires x shape (batch_size x seq_len x feature_dim)"
+
+ if mask is not None:
+ # At least the first element of each masked batch row must be valid for
+ # key_padding_mask.
+ mask[:, 0] = True
+ mask = ~mask
+
+ # Transpose x to (seq_length x batch_size x feature_dim).
+ x = x.transpose(0, 1)
+ attn_output, self._attention_weights = self._attention(
+ x, x, x, key_padding_mask=mask
+ )
+
+ # Transpose attention output to (batch_size x seq_length x feature_dim).
+ attn_output = attn_output.transpose(0, 1)
+ return attn_output
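+
+
+# A minimal usage sketch for TransposeMultiheadAttention (dimensions are
+# illustrative; feature_dim must be divisible by num_heads):
+#
+#   attn = TransposeMultiheadAttention(feature_dim=16, num_heads=2)
+#   x = torch.randn(2, 5, 16)
+#   mask = torch.ones(2, 5, dtype=torch.bool)
+#   y = attn(x, mask)                  # shape (2, 5, 16)
+#   weights = attn.attention_weights   # shape (2, 5, 5)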
+
+
+class LearnMaskedDefault(nn.Module):
+ """
+ Learns default values to fill invalid entries within input tensors. The
+ invalid entries are represented by a mask which is passed into forward alongside
+    the input tensor. Note that the default value is only used when every entry in a
+    batch row is invalid; rows containing at least one valid entry are left unchanged.
+ """
+
+ def __init__(
+ self, feature_dim: int, init_method: str = "gaussian", freeze: bool = False
+ ):
+ """
+ Args:
+            feature_dim (int): the size of the default value parameter; this must
+                match the feature dimension of the input tensor.
+            init_method (str): how to initialize the default value parameter. Options:
+                'gaussian'
+                'zeros'
+ freeze (bool): If True, the learned default parameter weights are frozen.
+ """
+ super().__init__()
+ if init_method == "zeros":
+ self._learned_defaults = nn.Parameter(
+ torch.zeros(feature_dim), requires_grad=(not freeze)
+ )
+ elif init_method == "gaussian":
+ self._learned_defaults = nn.Parameter(
+ torch.Tensor(feature_dim), requires_grad=(not freeze)
+ )
+ nn.init.normal_(self._learned_defaults)
+ else:
+ raise NotImplementedError(
+ f"{init_method} not available. Options are: 'zeros' or 'gaussian'"
+ )
+
+ def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ x (torch.Tensor): tensor of shape (batch_size, feature_dim).
+            mask (torch.Tensor): bool tensor of shape (batch_size, seq_len). If all
+                elements in a batch row are False, the learned default parameter is
+                used for that row.
+
+ Returns:
+ Tensor with shape (batch_size, feature_dim)
+ """
+ # Determine which rows have no valid entries and use these for the default value mask.
+ mask = mask.view(mask.shape[0], -1).any(dim=-1)
+ for i in range(1, x.dim()):
+ mask = mask.unsqueeze(i)
+ x = x * mask.float() + self._learned_defaults * (1 - mask.float())
+ return x
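+
+
+# A minimal usage sketch for LearnMaskedDefault (shapes are illustrative). The
+# second batch row has no valid entries, so it is replaced by the learned default:
+#
+#   fill = LearnMaskedDefault(feature_dim=16)
+#   x = torch.randn(2, 16)             # e.g. the output of a masked pooling layer
+#   mask = torch.tensor([[True, True, False], [False, False, False]])
+#   y = fill(x, mask)                  # row 0 unchanged, row 1 = learned default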
+
+
+class LSTM(nn.Module):
+ """
+ Wrapper for torch.nn.LSTM that handles masked inputs.
+ """
+
+ def __init__(
+ self,
+ dim_in: int,
+ hidden_dim: int,
+ dropout: float = 0.0,
+ bidirectional: bool = False,
+ ):
+ """
+ Args:
+ dim_in (int): input feature dimension
+            hidden_dim (int): hidden dimension of the LSTM layer
+ dropout (float): dropout rate - 0.0 if no dropout
+ bidirectional (bool): bidirectional or forward only
+ """
+ super().__init__()
+ self.lstm = nn.LSTM(
+ dim_in,
+ hidden_dim,
+ batch_first=True,
+ dropout=dropout,
+ bidirectional=bidirectional,
+ )
+ self.lstm.flatten_parameters()
+ self.output_dim = 2 * hidden_dim if bidirectional else hidden_dim
+ self.bidirectional = bidirectional
+
+ def forward(
+ self, data: torch.Tensor, mask: Optional[torch.Tensor] = None
+ ) -> torch.Tensor:
+ """
+ Args:
+ data (torch.Tensor): tensor with shape (batch_size, seq_len, feature_dim)
+ mask (torch.Tensor): bool tensor with shape (batch_size, seq_len).
+ Sequence elements that are False are invalid.
+
+ Returns:
+            Tensor with shape (batch_size, output_dim), where output_dim is determined
+                by hidden_dim and whether the LSTM is bidirectional.
+ """
+ assert data.dim() == 3
+ b, t = data.shape[0], data.shape[1]
+
+ if mask is None:
+ mask = torch.ones((b, t), dtype=torch.bool)
+
+        lengths = mask.sum(dim=1)
+ x_packed = pack_padded_sequence(
+ data,
+ lengths.clamp(1, data.size(1)),
+ batch_first=True,
+ enforce_sorted=False,
+ )
+ _, (h, _) = self.lstm(x_packed)
+
+ if self.bidirectional:
+ out = torch.cat([h[0, :, :], h[1, :, :]], dim=-1)
+ else:
+ out = h[-1, :, :]
+
+ return out
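+
+
+# A minimal usage sketch for the masked LSTM wrapper (shapes are illustrative):
+#
+#   rnn = LSTM(dim_in=16, hidden_dim=32, bidirectional=True)
+#   data = torch.randn(2, 5, 16)
+#   mask = torch.tensor(
+#       [[1, 1, 1, 1, 0], [1, 1, 0, 0, 0]], dtype=torch.bool
+#   )
+#   out = rnn(data, mask)              # shape (2, 64) = (batch, 2 * hidden_dim)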
+
+
+class TransposeTransformerEncoder(nn.Module):
+ """
+ Wrapper for torch.nn.TransformerEncoder that handles masked inputs.
+ """
+
+ def __init__(
+ self,
+ dim_in: int,
+ num_heads: int = 1,
+ num_layers: int = 1,
+ ):
+ """
+ Args:
+ dim_in (int): input feature dimension
+ num_heads (int): number of heads in the nn.MultiHeadAttention layers
+ num_layers (int): the number of sub-encoder-layers in the encoder
+ """
+ super().__init__()
+ self.encoder = nn.TransformerEncoder(
+ nn.TransformerEncoderLayer(dim_in, num_heads), num_layers
+ )
+
+ def forward(
+ self, data: torch.Tensor, mask: Optional[torch.Tensor] = None
+ ) -> torch.Tensor:
+ """
+ Args:
+ data (torch.Tensor): tensor with shape (batch_size, seq_len, feature_dim)
+ mask (torch.Tensor): bool tensor with shape (batch_size, seq_len).
+ Sequence elements that are False are invalid.
+
+ Returns:
+ Tensor with shape (batch_size, feature_dim)
+ """
+ if mask is not None:
+ # At least the first element of each masked batch row must be valid for
+ # key_padding_mask.
+ mask[:, 0] = True
+ mask = ~mask
+
+ out = self.encoder(
+ src=data.transpose(0, 1), src_key_padding_mask=mask
+ ).transpose(0, 1)
+
+ return out[:, 0, :]
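+
+
+# A minimal usage sketch for TransposeTransformerEncoder (dimensions are
+# illustrative; dim_in must be divisible by num_heads):
+#
+#   encoder = TransposeTransformerEncoder(dim_in=16, num_heads=2)
+#   data = torch.randn(2, 5, 16)
+#   mask = torch.ones(2, 5, dtype=torch.bool)
+#   out = encoder(data, mask)          # shape (2, 16): features of the first token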
+
+
+class MaskedSequential(nn.Sequential):
+ """
+ A sequential container that overrides forward to take a mask as well as the usual
+ input tensor. This mask is only applied to modules in _MASK_MODULES (which take
+ the mask argument).
+ """
+
+ _MASK_MODULES = [
+ MaskedTemporalPooling,
+ LearnMaskedDefault,
+ TransposeMultiheadAttention,
+ LSTM,
+ TransposeTransformerEncoder,
+ ]
+
+ def forward(self, input: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+ for module in self:
+ if any(isinstance(module, mask_type) for mask_type in self._MASK_MODULES):
+ input = module(input, mask=mask)
+ else:
+ input = module(input)
+
+ return input
+
+
+class MaskedMultiPathWay(nn.Module):
+ """
+ Masked multi-pathway is composed of a list of stream nn.Modules followed by a
+    fusion nn.Module that reduces these streams. Each stream module takes an input
+    tensor and its mask.
+
+ ::
+
+ Pathway 1 ... Pathway N
+ ↓ ↓
+ Block 1 Block N
+ ↓⭠ --Fusion----↓
+ """
+
+ def __init__(
+ self,
+ *,
+ multipathway_blocks: nn.ModuleList,
+ multipathway_fusion: Optional[nn.Module],
+ ) -> None:
+ """
+ Args:
+ multipathway_blocks (nn.module_list): list of models from all pathways.
+ multipathway_fusion (nn.module): fusion model.
+ """
+ super().__init__()
+ set_attributes(self, locals())
+
+ def forward(
+ self, x_and_mask: List[Tuple[torch.Tensor, torch.Tensor]]
+ ) -> torch.Tensor:
+ out = []
+ for pathway_idx in range(len(self.multipathway_blocks)):
+ out.append(self.multipathway_blocks[pathway_idx](*x_and_mask[pathway_idx]))
+
+        # If no fusion module is provided, return the per-pathway outputs as-is.
+        x = out
+        if self.multipathway_fusion is not None:
+            x = self.multipathway_fusion(out)
+        return x
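+
+
+# A minimal usage sketch for MaskedMultiPathWay. The concatenation fusion module
+# below is a hypothetical helper written for this example, not part of this file:
+#
+#   class ConcatFusion(nn.Module):
+#       def forward(self, xs: List[torch.Tensor]) -> torch.Tensor:
+#           return torch.cat(xs, dim=-1)
+#
+#   pathways = nn.ModuleList(
+#       [MaskedTemporalPooling(method="avg"), MaskedTemporalPooling(method="max")]
+#   )
+#   model = MaskedMultiPathWay(
+#       multipathway_blocks=pathways, multipathway_fusion=ConcatFusion()
+#   )
+#   x, mask = torch.randn(2, 5, 16), torch.ones(2, 5, dtype=torch.bool)
+#   out = model([(x, mask), (x, mask)])  # shape (2, 32)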
diff --git a/pytorchvideo/models/memory_bank.py b/pytorchvideo/models/memory_bank.py
new file mode 100644
index 00000000..f724c2d1
--- /dev/null
+++ b/pytorchvideo/models/memory_bank.py
@@ -0,0 +1,113 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import math
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorchvideo.layers.utils import set_attributes
+
+
+class MemoryBank(nn.Module):
+ """
+    Performs Non-Parametric Instance Discrimination for self-supervised learning on
+    video. A memory bank is built to keep and update historical feature embeddings
+    and use them for contrastive learning.
+
+ The original paper is:
+ Unsupervised Feature Learning via Non-Parametric Instance Discrimination
+ https://arxiv.org/pdf/1805.01978.pdf
+
+    More details can be found in the memory bank part of the following paper:
+ Momentum Contrast for Unsupervised Visual Representation Learning
+ https://arxiv.org/pdf/1911.05722.pdf
+ """
+
+ def __init__(
+ self,
+ backbone: nn.Module,
+ mlp: Optional[nn.Module] = None,
+ neg_size: int = 4096,
+ temperature: float = 0.07,
+ bank_size: int = 1280000,
+ dim: int = 2048,
+ mmt: float = 0.999,
+ ) -> None:
+ """
+ Args:
+ backbone (nn.Module): backbone used to forward the input.
+            mlp (nn.Module): multi-layer perceptron used in memory bank instance
+ discrimination model.
+ neg_size (int): size of negative samples per instance.
+ temperature (float): temperature to use for contrastive learning.
+ bank_size (int): size of the memory bank, expected to be the same size as
+ the training set.
+ dim (int): dimension of the channel.
+ mmt (float): momentum to use.
+ """
+ super().__init__()
+ set_attributes(self, locals())
+ self._init_mem_bank(bank_size, dim)
+
+ def _init_mem_bank(self, bank_size: int, dim: int) -> None:
+ """
+ Given the memory bank size and the channel dimension, initialize the memory
+ bank.
+ Args:
+ bank_size (int): size of the memory bank, expected to be the same size as
+ the training set.
+ dim (int): dimension of the channel.
+ """
+ stdv = 1.0 / math.sqrt(dim / 3)
+ self.register_buffer(
+ "memory",
+ torch.rand(
+ bank_size,
+ dim,
+ )
+ .mul_(2 * stdv)
+ .add_(-stdv)
+ .to(next(self.backbone.parameters()).device),
+ )
+
+ def forward(self, x: torch.Tensor, x_ind: torch.Tensor) -> torch.Tensor:
+ """
+ Perform contrastive learning with random sampled negative instance from the
+ memory bank. During training, update the memory bank with latest feature
+ embedding.
+ Args:
+            x (torch.tensor): a batch of augmented images. The input tensor shape
+                should be one that the backbone can consume.
+            x_ind (torch.tensor): the indices of the images in x within the dataset.
+                Expected shape is B.
+
+        Returns:
+            The instance discrimination loss (cross entropy) for the batch.
+ """
+ batch_size = x.shape[0]
+ x = self.backbone(x)
+ if self.mlp is not None:
+ x = self.mlp(x)
+ # Normalize the output embedding before multiplication.
+ x = F.normalize(x, p=2, dim=1)
+ # Random sample negative instances from the memory bank.
+ idx = torch.randint(0, self.bank_size, size=(batch_size, self.neg_size + 1)).to(
+ x.device
+ )
+ # Fill the first with positive instances.
+ idx.select(1, 0).copy_(x_ind.data)
+ weight = torch.index_select(self.memory, 0, idx.view(-1)).detach()
+ weight = weight.view(batch_size, self.neg_size + 1, self.dim)
+ # Multiplication for contrastive learning.
+ out = torch.einsum("bkc,bc->bk", weight, x)
+ out = torch.div(out, self.temperature)
+ gt = torch.zeros((batch_size,), device=x.device, dtype=torch.long)
+ loss = torch.nn.functional.cross_entropy(out, gt)
+ # Update memory during training.
+ if self.training:
+ with torch.no_grad():
+ pos = torch.index_select(self.memory, 0, x_ind.view(-1))
+ pos.mul_(self.mmt)
+ pos.add_(torch.mul(x, 1 - self.mmt))
+ norm = pos.pow(2).sum(1, keepdim=True).pow(0.5)
+ updated = pos.div(norm)
+ self.memory.index_copy_(0, x_ind, updated)
+ return loss
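+
+
+# A minimal usage sketch for MemoryBank with a toy backbone; the Linear backbone,
+# feature size and bank size below are illustrative only:
+#
+#   backbone = nn.Linear(8, 128)
+#   bank = MemoryBank(backbone, neg_size=16, bank_size=100, dim=128)
+#   x = torch.randn(4, 8)                       # batch of 4 inputs
+#   x_ind = torch.tensor([0, 1, 2, 3])          # dataset indices of the batch
+#   loss = bank(x, x_ind)                       # scalar instance-discrimination loss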
diff --git a/pytorchvideo/models/net.py b/pytorchvideo/models/net.py
new file mode 100644
index 00000000..f9a43263
--- /dev/null
+++ b/pytorchvideo/models/net.py
@@ -0,0 +1,92 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import List, Optional
+
+import torch
+import torch.nn as nn
+from pytorchvideo.layers.utils import set_attributes
+from pytorchvideo.models.weight_init import init_net_weights
+
+
+class Net(nn.Module):
+ """
+    Build a general Net model from a list of blocks for video recognition.
+
+ ::
+
+ Input
+ ↓
+ Block 1
+ ↓
+ .
+ .
+ .
+ ↓
+ Block N
+ ↓
+
+ The ResNet builder can be found in `create_resnet`.
+ """
+
+ def __init__(self, *, blocks: nn.ModuleList) -> None:
+ """
+ Args:
+ blocks (torch.nn.module_list): the list of block modules.
+ """
+ super().__init__()
+ assert blocks is not None
+ self.blocks = blocks
+ init_net_weights(self)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ for idx in range(len(self.blocks)):
+ x = self.blocks[idx](x)
+ return x
+
+
+class MultiPathWayWithFuse(nn.Module):
+ """
+    Build a multi-pathway block with fusion for video recognition. Each pathway
+    contains its own blocks, and a fusion layer aggregates the outputs across pathways.
+
+ ::
+
+ Pathway 1 ... Pathway N
+ ↓ ↓
+ Block 1 Block N
+ ↓⭠ --Fusion----↓
+ """
+
+ def __init__(
+ self,
+ *,
+ multipathway_blocks: nn.ModuleList,
+ multipathway_fusion: Optional[nn.Module],
+ inplace: Optional[bool] = True,
+ ) -> None:
+ """
+ Args:
+ multipathway_blocks (nn.module_list): list of models from all pathways.
+ multipathway_fusion (nn.module): fusion model.
+ inplace (bool): If inplace, directly update the input list without making
+ a copy.
+ """
+ super().__init__()
+ set_attributes(self, locals())
+
+ def forward(self, x: List[torch.Tensor]) -> torch.Tensor:
+ assert isinstance(
+ x, list
+ ), "input for MultiPathWayWithFuse needs to be a list of tensors"
+ if self.inplace:
+ x_out = x
+ else:
+ x_out = [None] * len(x)
+ for pathway_idx in range(len(self.multipathway_blocks)):
+ if self.multipathway_blocks[pathway_idx] is not None:
+ x_out[pathway_idx] = self.multipathway_blocks[pathway_idx](
+ x[pathway_idx]
+ )
+ if self.multipathway_fusion is not None:
+ x_out = self.multipathway_fusion(x_out)
+ return x_out
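+
+
+# A minimal usage sketch for MultiPathWayWithFuse. The channel-concatenation fusion
+# module below is a hypothetical helper written for this example, not part of this
+# file:
+#
+#   class ConcatFusion(nn.Module):
+#       def forward(self, xs: List[torch.Tensor]) -> torch.Tensor:
+#           return torch.cat(xs, dim=1)
+#
+#   model = MultiPathWayWithFuse(
+#       multipathway_blocks=nn.ModuleList([nn.Conv3d(4, 8, 1), nn.Conv3d(4, 8, 1)]),
+#       multipathway_fusion=ConcatFusion(),
+#   )
+#   x = [torch.randn(1, 4, 2, 8, 8), torch.randn(1, 4, 2, 8, 8)]
+#   out = model(x)  # shape (1, 16, 2, 8, 8)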
diff --git a/pytorchvideo/models/r2plus1d.py b/pytorchvideo/models/r2plus1d.py
new file mode 100644
index 00000000..48f02de8
--- /dev/null
+++ b/pytorchvideo/models/r2plus1d.py
@@ -0,0 +1,309 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+from functools import partial
+from typing import Callable, Tuple
+
+import torch.nn as nn
+from pytorchvideo.layers.convolutions import create_conv_2plus1d
+from pytorchvideo.models.head import create_res_basic_head
+from pytorchvideo.models.net import Net
+from pytorchvideo.models.resnet import create_bottleneck_block, create_res_stage
+from pytorchvideo.models.stem import create_res_basic_stem
+
+
+def create_2plus1d_bottleneck_block(
+ *,
+ # Convolution configs.
+ dim_in: int,
+ dim_inner: int,
+ dim_out: int,
+ conv_a_kernel_size: Tuple[int] = (1, 1, 1),
+ conv_a_stride: Tuple[int] = (1, 1, 1),
+ conv_a_padding: Tuple[int] = (0, 0, 0),
+ conv_a: Callable = nn.Conv3d,
+ conv_b_kernel_size: Tuple[int] = (3, 3, 3),
+ conv_b_stride: Tuple[int] = (2, 2, 2),
+ conv_b_padding: Tuple[int] = (1, 1, 1),
+ conv_b_num_groups: int = 1,
+ conv_b_dilation: Tuple[int] = (1, 1, 1),
+ conv_b: Callable = create_conv_2plus1d,
+ conv_c: Callable = nn.Conv3d,
+ # Norm configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+) -> nn.Module:
+ """
+ 2plus1d bottleneck block: a sequence of spatiotemporal Convolution, Normalization,
+ and Activations repeated in the following order:
+
+ ::
+
+ Conv3d (conv_a)
+ ↓
+ Normalization (norm_a)
+ ↓
+ Activation (act_a)
+ ↓
+ Conv(2+1)d (conv_b)
+ ↓
+ Normalization (norm_b)
+ ↓
+ Activation (act_b)
+ ↓
+ Conv3d (conv_c)
+ ↓
+ Normalization (norm_c)
+
+ Normalization examples include: BatchNorm3d and None (no normalization).
+ Activation examples include: ReLU, Softmax, Sigmoid, and None (no activation).
+
+ Args:
+ dim_in (int): input channel size to the bottleneck block.
+ dim_inner (int): intermediate channel size of the bottleneck.
+ dim_out (int): output channel size of the bottleneck.
+ conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
+ conv_a_stride (tuple): convolutional stride size(s) for conv_a.
+ conv_a_padding (tuple): convolutional padding(s) for conv_a.
+ conv_a (callable): a callable that constructs the conv_a conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+ conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
+ conv_b_stride (tuple): convolutional stride size(s) for conv_b.
+ conv_b_padding (tuple): convolutional padding(s) for conv_b.
+ conv_b_num_groups (int): number of groups for groupwise convolution for
+ conv_b.
+ conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
+ conv_b (callable): a callable that constructs the conv_b conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+ conv_c (callable): a callable that constructs the conv_c conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+
+ norm (callable): a callable that constructs normalization layer, examples
+ include nn.BatchNorm3d, None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+
+ activation (callable): a callable that constructs activation layer, examples
+ include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
+ activation).
+
+ Returns:
+ (nn.Module): 2plus1d bottleneck block.
+ """
+ return create_bottleneck_block(
+ dim_in=dim_in,
+ dim_inner=dim_inner,
+ dim_out=dim_out,
+ conv_a_kernel_size=conv_a_kernel_size,
+ conv_a_stride=conv_a_stride,
+ conv_a_padding=conv_a_padding,
+ conv_a=conv_a,
+ conv_b_kernel_size=conv_b_kernel_size,
+ conv_b_stride=conv_b_stride,
+ conv_b_padding=conv_b_padding,
+ conv_b_num_groups=conv_b_num_groups,
+ conv_b_dilation=conv_b_dilation,
+ conv_b=partial(
+ create_conv_2plus1d,
+ norm=norm,
+ norm_eps=norm_eps,
+ norm_momentum=norm_momentum,
+ activation=activation,
+ ),
+ conv_c=conv_c,
+ norm=norm,
+ norm_eps=norm_eps,
+ norm_momentum=norm_momentum,
+ activation=activation,
+ )
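+
+
+# A minimal usage sketch for create_2plus1d_bottleneck_block; the channel sizes and
+# clip shape below are illustrative only:
+#
+#   import torch
+#
+#   block = create_2plus1d_bottleneck_block(dim_in=64, dim_inner=16, dim_out=64)
+#   y = block(torch.randn(1, 64, 4, 8, 8))
+#   # With the default conv_b stride of (2, 2, 2), y has shape (1, 64, 2, 4, 4).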
+
+
+def create_r2plus1d(
+ *,
+ # Input clip configs.
+ input_channel: int = 3,
+ # Model configs.
+ model_depth: int = 50,
+ model_num_class: int = 400,
+ dropout_rate: float = 0.0,
+ # Normalization configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+ # Stem configs.
+ stem_dim_out: int = 64,
+ stem_conv_kernel_size: Tuple[int] = (1, 7, 7),
+ stem_conv_stride: Tuple[int] = (1, 2, 2),
+ # Stage configs.
+ stage_conv_a_kernel_size: Tuple[Tuple[int]] = (
+ (1, 1, 1),
+ (1, 1, 1),
+ (1, 1, 1),
+ (1, 1, 1),
+ ),
+ stage_conv_b_kernel_size: Tuple[Tuple[int]] = (
+ (3, 3, 3),
+ (3, 3, 3),
+ (3, 3, 3),
+ (3, 3, 3),
+ ),
+ stage_conv_b_num_groups: Tuple[int] = (1, 1, 1, 1),
+ stage_conv_b_dilation: Tuple[Tuple[int]] = (
+ (1, 1, 1),
+ (1, 1, 1),
+ (1, 1, 1),
+ (1, 1, 1),
+ ),
+ stage_spatial_stride: Tuple[int] = (2, 2, 2, 2),
+ stage_temporal_stride: Tuple[int] = (1, 1, 2, 2),
+ stage_bottleneck: Tuple[Callable] = (
+ create_2plus1d_bottleneck_block,
+ create_2plus1d_bottleneck_block,
+ create_2plus1d_bottleneck_block,
+ create_2plus1d_bottleneck_block,
+ ),
+ # Head configs.
+ head_pool: Callable = nn.AvgPool3d,
+ head_pool_kernel_size: Tuple[int] = (4, 7, 7),
+ head_output_size: Tuple[int] = (1, 1, 1),
+ head_activation: Callable = nn.Softmax,
+ head_output_with_global_average: bool = True,
+) -> nn.Module:
+ """
+ Build the R(2+1)D network from::
+ A closer look at spatiotemporal convolutions for action recognition.
+ Du Tran, Heng Wang, Lorenzo Torresani, Jamie Ray, Yann LeCun, Manohar Paluri. CVPR 2018.
+
+ R(2+1)D follows the ResNet style architecture including three parts: Stem,
+ Stages and Head. The three parts are assembled in the following order:
+
+ ::
+
+ Input
+ ↓
+ Stem
+ ↓
+ Stage 1
+ ↓
+ .
+ .
+ .
+ ↓
+ Stage N
+ ↓
+ Head
+
+ Args:
+
+ input_channel (int): number of channels for the input video clip.
+
+ model_depth (int): the depth of the resnet.
+ model_num_class (int): the number of classes for the video dataset.
+ dropout_rate (float): dropout rate.
+
+ norm (callable): a callable that constructs normalization layer.
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+
+ activation (callable): a callable that constructs activation layer.
+
+ stem_dim_out (int): output channel size for stem.
+ stem_conv_kernel_size (tuple): convolutional kernel size(s) of stem.
+ stem_conv_stride (tuple): convolutional stride size(s) of stem.
+
+ stage_conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
+ stage_conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
+ stage_conv_b_num_groups (tuple): number of groups for groupwise convolution
+ for conv_b. 1 for ResNet, and larger than 1 for ResNeXt.
+ stage_conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
+ stage_spatial_stride (tuple): the spatial stride for each stage.
+ stage_temporal_stride (tuple): the temporal stride for each stage.
+ stage_bottleneck (tuple): a callable that constructs bottleneck block layer
+ for each stage. Examples include: create_bottleneck_block,
+ create_2plus1d_bottleneck_block.
+
+ head_pool (callable): a callable that constructs resnet head pooling layer.
+ head_pool_kernel_size (tuple): the pooling kernel size.
+ head_output_size (tuple): the size of output tensor for head.
+ head_activation (callable): a callable that constructs activation layer.
+ head_output_with_global_average (bool): if True, perform global averaging on
+ the head output.
+
+ Returns:
+        (nn.Module): the R(2+1)D network.
+ """
+ # Number of blocks for different stages given the model depth.
+ _MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3), 101: (3, 4, 23, 3), 152: (3, 8, 36, 3)}
+
+ # Given a model depth, get the number of blocks for each stage.
+ assert (
+ model_depth in _MODEL_STAGE_DEPTH.keys()
+ ), f"{model_depth} is not in {_MODEL_STAGE_DEPTH.keys()}"
+ stage_depths = _MODEL_STAGE_DEPTH[model_depth]
+
+ blocks = []
+ # Create stem for R(2+1)D.
+ stem = create_res_basic_stem(
+ in_channels=input_channel,
+ out_channels=stem_dim_out,
+ conv_kernel_size=stem_conv_kernel_size,
+ conv_stride=stem_conv_stride,
+ conv_padding=[size // 2 for size in stem_conv_kernel_size],
+ pool=None,
+ norm=norm,
+ activation=activation,
+ )
+ blocks.append(stem)
+
+ stage_dim_in = stem_dim_out
+ stage_dim_out = stage_dim_in * 4
+
+ # Create each stage for R(2+1)D.
+ for idx in range(len(stage_depths)):
+ stage_dim_inner = stage_dim_out // 4
+ depth = stage_depths[idx]
+
+ stage_conv_b_stride = (
+ stage_temporal_stride[idx],
+ stage_spatial_stride[idx],
+ stage_spatial_stride[idx],
+ )
+
+ stage = create_res_stage(
+ depth=depth,
+ dim_in=stage_dim_in,
+ dim_inner=stage_dim_inner,
+ dim_out=stage_dim_out,
+ bottleneck=stage_bottleneck[idx],
+ conv_a_kernel_size=stage_conv_a_kernel_size[idx],
+ conv_a_stride=[1, 1, 1],
+ conv_a_padding=[size // 2 for size in stage_conv_a_kernel_size[idx]],
+ conv_b_kernel_size=stage_conv_b_kernel_size[idx],
+ conv_b_stride=stage_conv_b_stride,
+ conv_b_padding=[size // 2 for size in stage_conv_b_kernel_size[idx]],
+ conv_b_num_groups=stage_conv_b_num_groups[idx],
+ conv_b_dilation=stage_conv_b_dilation[idx],
+ norm=norm,
+ activation=activation,
+ )
+
+ blocks.append(stage)
+ stage_dim_in = stage_dim_out
+ stage_dim_out = stage_dim_out * 2
+
+ # Create head for R(2+1)D.
+ head = create_res_basic_head(
+ in_features=stage_dim_in,
+ out_features=model_num_class,
+ pool=head_pool,
+ output_size=head_output_size,
+ pool_kernel_size=head_pool_kernel_size,
+ dropout_rate=dropout_rate,
+ activation=head_activation,
+ output_with_global_average=head_output_with_global_average,
+ )
+ blocks.append(head)
+ return Net(blocks=nn.ModuleList(blocks))
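+
+
+# A minimal usage sketch for create_r2plus1d. The clip size below is chosen so that
+# the default head pooling kernel of (4, 7, 7) matches the final feature map; it is
+# illustrative, not a statement about training resolution:
+#
+#   import torch
+#
+#   model = create_r2plus1d(model_depth=50, model_num_class=400)
+#   clip = torch.randn(1, 3, 16, 224, 224)     # (batch, channel, time, height, width)
+#   preds = model(clip)                        # shape (1, 400)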
diff --git a/pytorchvideo/models/resnet.py b/pytorchvideo/models/resnet.py
new file mode 100644
index 00000000..24346c83
--- /dev/null
+++ b/pytorchvideo/models/resnet.py
@@ -0,0 +1,1383 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Callable, List, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from pytorchvideo.layers.utils import set_attributes
+from pytorchvideo.models.head import create_res_basic_head
+from pytorchvideo.models.net import Net
+from pytorchvideo.models.stem import (
+ create_acoustic_res_basic_stem,
+ create_res_basic_stem,
+)
+
+
+def create_bottleneck_block(
+ *,
+ # Convolution configs.
+ dim_in: int,
+ dim_inner: int,
+ dim_out: int,
+ conv_a_kernel_size: Tuple[int] = (3, 1, 1),
+ conv_a_stride: Tuple[int] = (2, 1, 1),
+ conv_a_padding: Tuple[int] = (1, 0, 0),
+ conv_a: Callable = nn.Conv3d,
+ conv_b_kernel_size: Tuple[int] = (1, 3, 3),
+ conv_b_stride: Tuple[int] = (1, 2, 2),
+ conv_b_padding: Tuple[int] = (0, 1, 1),
+ conv_b_num_groups: int = 1,
+ conv_b_dilation: Tuple[int] = (1, 1, 1),
+ conv_b: Callable = nn.Conv3d,
+ conv_c: Callable = nn.Conv3d,
+ # Norm configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+) -> nn.Module:
+ """
+ Bottleneck block: a sequence of spatiotemporal Convolution, Normalization,
+ and Activations repeated in the following order:
+
+ ::
+
+ Conv3d (conv_a)
+ ↓
+ Normalization (norm_a)
+ ↓
+ Activation (act_a)
+ ↓
+ Conv3d (conv_b)
+ ↓
+ Normalization (norm_b)
+ ↓
+ Activation (act_b)
+ ↓
+ Conv3d (conv_c)
+ ↓
+ Normalization (norm_c)
+
+ Normalization examples include: BatchNorm3d and None (no normalization).
+ Activation examples include: ReLU, Softmax, Sigmoid, and None (no activation).
+
+ Args:
+ dim_in (int): input channel size to the bottleneck block.
+ dim_inner (int): intermediate channel size of the bottleneck.
+ dim_out (int): output channel size of the bottleneck.
+ conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
+ conv_a_stride (tuple): convolutional stride size(s) for conv_a.
+ conv_a_padding (tuple): convolutional padding(s) for conv_a.
+ conv_a (callable): a callable that constructs the conv_a conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+ conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
+ conv_b_stride (tuple): convolutional stride size(s) for conv_b.
+ conv_b_padding (tuple): convolutional padding(s) for conv_b.
+ conv_b_num_groups (int): number of groups for groupwise convolution for
+ conv_b.
+ conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
+ conv_b (callable): a callable that constructs the conv_b conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+ conv_c (callable): a callable that constructs the conv_c conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+
+ norm (callable): a callable that constructs normalization layer, examples
+ include nn.BatchNorm3d, None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+
+ activation (callable): a callable that constructs activation layer, examples
+ include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
+ activation).
+
+ Returns:
+ (nn.Module): resnet bottleneck block.
+ """
+ conv_a = conv_a(
+ in_channels=dim_in,
+ out_channels=dim_inner,
+ kernel_size=conv_a_kernel_size,
+ stride=conv_a_stride,
+ padding=conv_a_padding,
+ bias=False,
+ )
+ norm_a = (
+ None
+ if norm is None
+ else norm(num_features=dim_inner, eps=norm_eps, momentum=norm_momentum)
+ )
+ act_a = None if activation is None else activation()
+
+ conv_b = conv_b(
+ in_channels=dim_inner,
+ out_channels=dim_inner,
+ kernel_size=conv_b_kernel_size,
+ stride=conv_b_stride,
+ padding=conv_b_padding,
+ bias=False,
+ groups=conv_b_num_groups,
+ dilation=conv_b_dilation,
+ )
+ norm_b = (
+ None
+ if norm is None
+ else norm(num_features=dim_inner, eps=norm_eps, momentum=norm_momentum)
+ )
+ act_b = None if activation is None else activation()
+
+ conv_c = conv_c(
+ in_channels=dim_inner, out_channels=dim_out, kernel_size=(1, 1, 1), bias=False
+ )
+ norm_c = (
+ None
+ if norm is None
+ else norm(num_features=dim_out, eps=norm_eps, momentum=norm_momentum)
+ )
+
+ return BottleneckBlock(
+ conv_a=conv_a,
+ norm_a=norm_a,
+ act_a=act_a,
+ conv_b=conv_b,
+ norm_b=norm_b,
+ act_b=act_b,
+ conv_c=conv_c,
+ norm_c=norm_c,
+ )
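+
+
+# A minimal usage sketch for create_bottleneck_block; channel sizes and clip shape
+# are illustrative only:
+#
+#   block = create_bottleneck_block(dim_in=64, dim_inner=16, dim_out=64)
+#   y = block(torch.randn(1, 64, 8, 16, 16))
+#   # Default strides of (2, 1, 1) for conv_a and (1, 2, 2) for conv_b give
+#   # y a shape of (1, 64, 4, 8, 8).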
+
+
+def create_acoustic_bottleneck_block(
+ *,
+ # Convolution configs.
+ dim_in: int,
+ dim_inner: int,
+ dim_out: int,
+ conv_a_kernel_size: Tuple[int] = (3, 1, 1),
+ conv_a_stride: Tuple[int] = (2, 1, 1),
+ conv_a_padding: Tuple[int] = (1, 0, 0),
+ conv_a: Callable = nn.Conv3d,
+ # Conv b f configs.
+ conv_b_kernel_size: Tuple[int] = (1, 1, 1),
+ conv_b_stride: Tuple[int] = (1, 1, 1),
+ conv_b_padding: Tuple[int] = (0, 0, 0),
+ conv_b_num_groups: int = 1,
+ conv_b_dilation: Tuple[int] = (1, 1, 1),
+ conv_b: Callable = nn.Conv3d,
+ conv_c: Callable = nn.Conv3d,
+ # Norm configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+) -> nn.Module:
+ """
+ Acoustic Bottleneck block: a sequence of spatiotemporal Convolution, Normalization,
+ and Activations repeated in the following order:
+
+ ::
+
+ Conv3d (conv_a)
+ ↓
+ Normalization (norm_a)
+ ↓
+ Activation (act_a)
+ ↓
+ ---------------------------------
+ ↓ ↓
+ Temporal Conv3d (conv_b) Spatial Conv3d (conv_b)
+ ↓ ↓
+ Normalization (norm_b) Normalization (norm_b)
+ ↓ ↓
+ Activation (act_b) Activation (act_b)
+ ↓ ↓
+ ---------------------------------
+ ↓
+ Conv3d (conv_c)
+ ↓
+ Normalization (norm_c)
+
+ Normalization examples include: BatchNorm3d and None (no normalization).
+ Activation examples include: ReLU, Softmax, Sigmoid, and None (no activation).
+
+ Args:
+ dim_in (int): input channel size to the bottleneck block.
+ dim_inner (int): intermediate channel size of the bottleneck.
+ dim_out (int): output channel size of the bottleneck.
+ conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
+ conv_a_stride (tuple): convolutional stride size(s) for conv_a.
+ conv_a_padding (tuple): convolutional padding(s) for conv_a.
+ conv_a (callable): a callable that constructs the conv_a conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+ conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
+ conv_b_stride (tuple): convolutional stride size(s) for conv_b.
+ conv_b_padding (tuple): convolutional padding(s) for conv_b.
+ conv_b_num_groups (int): number of groups for groupwise convolution for
+ conv_b.
+ conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
+ conv_b (callable): a callable that constructs the conv_b conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+ conv_c (callable): a callable that constructs the conv_c conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+
+ norm (callable): a callable that constructs normalization layer, examples
+ include nn.BatchNorm3d, None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+
+ activation (callable): a callable that constructs activation layer, examples
+ include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
+ activation).
+
+ Returns:
+ (nn.Module): resnet acoustic bottleneck block.
+ """
+ conv_a = conv_a(
+ in_channels=dim_in,
+ out_channels=dim_inner,
+ kernel_size=conv_a_kernel_size,
+ stride=conv_a_stride,
+ padding=conv_a_padding,
+ bias=False,
+ )
+ norm_a = (
+ None
+ if norm is None
+ else norm(num_features=dim_inner, eps=norm_eps, momentum=norm_momentum)
+ )
+ act_a = None if activation is None else activation()
+
+ conv_b_1_kernel_size = [conv_b_kernel_size[0], 1, 1]
+ conv_b_1_stride = conv_b_stride
+ conv_b_1_padding = [conv_b_padding[0], 0, 0]
+
+ conv_b_2_kernel_size = [1, conv_b_kernel_size[1], conv_b_kernel_size[2]]
+ conv_b_2_stride = conv_b_stride
+ conv_b_2_padding = [0, conv_b_padding[1], conv_b_padding[2]]
+
+ conv_b_1_num_groups, conv_b_2_num_groups = (conv_b_num_groups,) * 2
+ conv_b_1_dilation = [conv_b_dilation[0], 1, 1]
+ conv_b_2_dilation = [1, conv_b_dilation[1], conv_b_dilation[2]]
+
+ conv_b_1 = conv_b(
+ in_channels=dim_inner,
+ out_channels=dim_inner,
+ kernel_size=conv_b_1_kernel_size,
+ stride=conv_b_1_stride,
+ padding=conv_b_1_padding,
+ bias=False,
+ groups=conv_b_1_num_groups,
+ dilation=conv_b_1_dilation,
+ )
+ norm_b_1 = (
+ None
+ if norm is None
+ else norm(num_features=dim_inner, eps=norm_eps, momentum=norm_momentum)
+ )
+ act_b_1 = None if activation is None else activation()
+
+ conv_b_2 = conv_b(
+ in_channels=dim_inner,
+ out_channels=dim_inner,
+ kernel_size=conv_b_2_kernel_size,
+ stride=conv_b_2_stride,
+ padding=conv_b_2_padding,
+ bias=False,
+ groups=conv_b_2_num_groups,
+ dilation=conv_b_2_dilation,
+ )
+ norm_b_2 = (
+ None
+ if norm is None
+ else norm(num_features=dim_inner, eps=norm_eps, momentum=norm_momentum)
+ )
+ act_b_2 = None if activation is None else activation()
+
+ conv_c = conv_c(
+ in_channels=dim_inner, out_channels=dim_out, kernel_size=(1, 1, 1), bias=False
+ )
+ norm_c = (
+ None
+ if norm is None
+ else norm(num_features=dim_out, eps=norm_eps, momentum=norm_momentum)
+ )
+
+ return SeparableBottleneckBlock(
+ conv_a=conv_a,
+ norm_a=norm_a,
+ act_a=act_a,
+ conv_b=nn.ModuleList([conv_b_2, conv_b_1]),
+ norm_b=nn.ModuleList([norm_b_2, norm_b_1]),
+ act_b=nn.ModuleList([act_b_2, act_b_1]),
+ conv_c=conv_c,
+ norm_c=norm_c,
+ )
+
+
+def create_res_block(
+ *,
+ # Bottleneck Block configs.
+ dim_in: int,
+ dim_inner: int,
+ dim_out: int,
+ bottleneck: Callable,
+ use_shortcut: bool = False,
+ branch_fusion: Callable = lambda x, y: x + y,
+ # Conv configs.
+ conv_a_kernel_size: Tuple[int] = (3, 1, 1),
+ conv_a_stride: Tuple[int] = (2, 1, 1),
+ conv_a_padding: Tuple[int] = (1, 0, 0),
+ conv_a: Callable = nn.Conv3d,
+ conv_b_kernel_size: Tuple[int] = (1, 3, 3),
+ conv_b_stride: Tuple[int] = (1, 2, 2),
+ conv_b_padding: Tuple[int] = (0, 1, 1),
+ conv_b_num_groups: int = 1,
+ conv_b_dilation: Tuple[int] = (1, 1, 1),
+ conv_b: Callable = nn.Conv3d,
+ conv_c: Callable = nn.Conv3d,
+ conv_skip: Callable = nn.Conv3d,
+ # Norm configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ # Activation configs.
+ activation_bottleneck: Callable = nn.ReLU,
+ activation_block: Callable = nn.ReLU,
+) -> nn.Module:
+ """
+ Residual block. Performs a summation between an identity shortcut in branch1 and a
+ main block in branch2. When the input and output dimensions are different, a
+ convolution followed by a normalization will be performed.
+
+ ::
+
+
+ Input
+ |-------+
+ ↓ |
+ Block |
+ ↓ |
+ Summation ←-+
+ ↓
+ Activation
+
+ Normalization examples include: BatchNorm3d and None (no normalization).
+ Activation examples include: ReLU, Softmax, Sigmoid, and None (no activation).
+ Transform examples include: BottleneckBlock.
+
+ Args:
+ dim_in (int): input channel size to the bottleneck block.
+ dim_inner (int): intermediate channel size of the bottleneck.
+ dim_out (int): output channel size of the bottleneck.
+ bottleneck (callable): a callable that constructs bottleneck block layer.
+ Examples include: create_bottleneck_block.
+ use_shortcut (bool): If true, use conv and norm layers in skip connection.
+ branch_fusion (callable): a callable that constructs summation layer.
+ Examples include: lambda x, y: x + y, OctaveSum.
+
+ conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
+ conv_a_stride (tuple): convolutional stride size(s) for conv_a.
+ conv_a_padding (tuple): convolutional padding(s) for conv_a.
+ conv_a (callable): a callable that constructs the conv_a conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+ conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
+ conv_b_stride (tuple): convolutional stride size(s) for conv_b.
+ conv_b_padding (tuple): convolutional padding(s) for conv_b.
+ conv_b_num_groups (int): number of groups for groupwise convolution for
+ conv_b.
+ conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
+ conv_b (callable): a callable that constructs the conv_b conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+ conv_c (callable): a callable that constructs the conv_c conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+ conv_skip (callable): a callable that constructs the conv_skip conv layer,
+ examples include nn.Conv3d, OctaveConv, etc
+
+ norm (callable): a callable that constructs normalization layer. Examples
+ include nn.BatchNorm3d, None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+
+ activation_bottleneck (callable): a callable that constructs activation layer in
+ bottleneck. Examples include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None
+ (not performing activation).
+ activation_block (callable): a callable that constructs activation layer used
+ at the end of the block. Examples include: nn.ReLU, nn.Softmax, nn.Sigmoid,
+ and None (not performing activation).
+
+ Returns:
+ (nn.Module): resnet basic block layer.
+ """
+ branch1_conv_stride = tuple(map(np.prod, zip(conv_a_stride, conv_b_stride)))
+ norm_model = None
+ if use_shortcut or (
+ norm is not None and (dim_in != dim_out or np.prod(branch1_conv_stride) != 1)
+ ):
+ norm_model = norm(num_features=dim_out, eps=norm_eps, momentum=norm_momentum)
+
+ return ResBlock(
+ branch1_conv=conv_skip(
+ dim_in,
+ dim_out,
+ kernel_size=(1, 1, 1),
+ stride=branch1_conv_stride,
+ bias=False,
+ )
+ if (dim_in != dim_out or np.prod(branch1_conv_stride) != 1) or use_shortcut
+ else None,
+ branch1_norm=norm_model,
+ branch2=bottleneck(
+ dim_in=dim_in,
+ dim_inner=dim_inner,
+ dim_out=dim_out,
+ conv_a_kernel_size=conv_a_kernel_size,
+ conv_a_stride=conv_a_stride,
+ conv_a_padding=conv_a_padding,
+ conv_a=conv_a,
+ conv_b_kernel_size=conv_b_kernel_size,
+ conv_b_stride=conv_b_stride,
+ conv_b_padding=conv_b_padding,
+ conv_b_num_groups=conv_b_num_groups,
+ conv_b_dilation=conv_b_dilation,
+ conv_b=conv_b,
+ conv_c=conv_c,
+ norm=norm,
+ norm_eps=norm_eps,
+ norm_momentum=norm_momentum,
+ activation=activation_bottleneck,
+ ),
+ activation=None if activation_block is None else activation_block(),
+ branch_fusion=branch_fusion,
+ )
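+
+
+# A minimal usage sketch for create_res_block; because dim_in != dim_out, the skip
+# connection gets its own conv and norm (values below are illustrative only):
+#
+#   block = create_res_block(
+#       dim_in=64, dim_inner=16, dim_out=128, bottleneck=create_bottleneck_block
+#   )
+#   y = block(torch.randn(1, 64, 8, 16, 16))   # shape (1, 128, 4, 8, 8)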
+
+
+def create_res_stage(
+ *,
+ # Stage configs.
+ depth: int,
+ # Bottleneck Block configs.
+ dim_in: int,
+ dim_inner: int,
+ dim_out: int,
+ bottleneck: Callable,
+ # Conv configs.
+ conv_a_kernel_size: Union[Tuple[int], List[Tuple[int]]] = (3, 1, 1),
+ conv_a_stride: Tuple[int] = (2, 1, 1),
+ conv_a_padding: Union[Tuple[int], List[Tuple[int]]] = (1, 0, 0),
+ conv_a: Callable = nn.Conv3d,
+ conv_b_kernel_size: Tuple[int] = (1, 3, 3),
+ conv_b_stride: Tuple[int] = (1, 2, 2),
+ conv_b_padding: Tuple[int] = (0, 1, 1),
+ conv_b_num_groups: int = 1,
+ conv_b_dilation: Tuple[int] = (1, 1, 1),
+ conv_b: Callable = nn.Conv3d,
+ conv_c: Callable = nn.Conv3d,
+ # Norm configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+) -> nn.Module:
+ """
+ Create Residual Stage, which composes sequential blocks that make up a ResNet. These
+ blocks could be, for example, Residual blocks, Non-Local layers, or
+ Squeeze-Excitation layers.
+
+ ::
+
+
+ Input
+ ↓
+ ResBlock
+ ↓
+ .
+ .
+ .
+ ↓
+ ResBlock
+
+ Normalization examples include: BatchNorm3d and None (no normalization).
+ Activation examples include: ReLU, Softmax, Sigmoid, and None (no activation).
+ Bottleneck examples include: create_bottleneck_block.
+
+ Args:
+        depth (int): number of blocks to create.
+
+ dim_in (int): input channel size to the bottleneck block.
+ dim_inner (int): intermediate channel size of the bottleneck.
+ dim_out (int): output channel size of the bottleneck.
+ bottleneck (callable): a callable that constructs bottleneck block layer.
+ Examples include: create_bottleneck_block.
+
+ conv_a_kernel_size (tuple or list of tuple): convolutional kernel size(s)
+ for conv_a. If conv_a_kernel_size is a tuple, use it for all blocks in
+ the stage. If conv_a_kernel_size is a list of tuple, the kernel sizes
+            will be repeated until the list matches the stage depth. For example,
+            for conv_a_kernel_size = [(3, 1, 1), (1, 1, 1)] and a depth of 6, the
+            kernel sizes for the blocks would be [(3, 1, 1), (1, 1, 1), (3, 1, 1),
+            (1, 1, 1), (3, 1, 1), (1, 1, 1)].
+ conv_a_stride (tuple): convolutional stride size(s) for conv_a.
+ conv_a_padding (tuple or list of tuple): convolutional padding(s) for
+ conv_a. If conv_a_padding is a tuple, use it for all blocks in
+ the stage. If conv_a_padding is a list of tuple, the padding sizes
+            will be repeated until the list matches the stage depth.
+ conv_a (callable): a callable that constructs the conv_a conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+ conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
+ conv_b_stride (tuple): convolutional stride size(s) for conv_b.
+ conv_b_padding (tuple): convolutional padding(s) for conv_b.
+ conv_b_num_groups (int): number of groups for groupwise convolution for
+ conv_b.
+ conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
+ conv_b (callable): a callable that constructs the conv_b conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+ conv_c (callable): a callable that constructs the conv_c conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+
+ norm (callable): a callable that constructs normalization layer. Examples
+ include nn.BatchNorm3d, and None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+
+ activation (callable): a callable that constructs activation layer. Examples
+ include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
+ activation).
+
+ Returns:
+ (nn.Module): resnet basic stage layer.
+ """
+ res_blocks = []
+ if isinstance(conv_a_kernel_size[0], int):
+ conv_a_kernel_size = [conv_a_kernel_size]
+ if isinstance(conv_a_padding[0], int):
+ conv_a_padding = [conv_a_padding]
+ # Repeat conv_a kernels until having same length of depth in the stage.
+ conv_a_kernel_size = (conv_a_kernel_size * depth)[:depth]
+ conv_a_padding = (conv_a_padding * depth)[:depth]
+
+ for ind in range(depth):
+ block = create_res_block(
+ dim_in=dim_in if ind == 0 else dim_out,
+ dim_inner=dim_inner,
+ dim_out=dim_out,
+ bottleneck=bottleneck,
+ conv_a_kernel_size=conv_a_kernel_size[ind],
+ conv_a_stride=conv_a_stride if ind == 0 else (1, 1, 1),
+ conv_a_padding=conv_a_padding[ind],
+ conv_a=conv_a,
+ conv_b_kernel_size=conv_b_kernel_size,
+ conv_b_stride=conv_b_stride if ind == 0 else (1, 1, 1),
+ conv_b_padding=conv_b_padding,
+ conv_b_num_groups=conv_b_num_groups,
+ conv_b_dilation=conv_b_dilation,
+ conv_b=conv_b,
+ conv_c=conv_c,
+ norm=norm,
+ norm_eps=norm_eps,
+ norm_momentum=norm_momentum,
+ activation_bottleneck=activation,
+ activation_block=activation,
+ )
+ res_blocks.append(block)
+ return ResStage(res_blocks=nn.ModuleList(res_blocks))
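+
+
+# A minimal usage sketch for create_res_stage, showing how a list of conv_a kernel
+# sizes is repeated across the blocks of the stage (values are illustrative only):
+#
+#   stage = create_res_stage(
+#       depth=3,
+#       dim_in=64,
+#       dim_inner=16,
+#       dim_out=256,
+#       bottleneck=create_bottleneck_block,
+#       conv_a_kernel_size=[(3, 1, 1), (1, 1, 1)],   # used as (3,1,1), (1,1,1), (3,1,1)
+#       conv_a_stride=(1, 1, 1),
+#       conv_a_padding=[(1, 0, 0), (0, 0, 0)],
+#   )
+#   feats = stage(torch.randn(1, 64, 4, 16, 16))     # shape (1, 256, 4, 8, 8)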
+
+
+def create_resnet(
+ *,
+ # Input clip configs.
+ input_channel: int = 3,
+ # Model configs.
+ model_depth: int = 50,
+ model_num_class: int = 400,
+ dropout_rate: float = 0.5,
+ # Normalization configs.
+ norm: Callable = nn.BatchNorm3d,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+ # Stem configs.
+ stem_dim_out: int = 64,
+ stem_conv_kernel_size: Tuple[int] = (3, 7, 7),
+ stem_conv_stride: Tuple[int] = (1, 2, 2),
+ stem_pool: Callable = nn.MaxPool3d,
+ stem_pool_kernel_size: Tuple[int] = (1, 3, 3),
+ stem_pool_stride: Tuple[int] = (1, 2, 2),
+ # Stage configs.
+ stage1_pool: Callable = None,
+ stage1_pool_kernel_size: Tuple[int] = (2, 1, 1),
+ stage_conv_a_kernel_size: Tuple[Union[Tuple[int], List[Tuple[int]]]] = (
+ (1, 1, 1),
+ (1, 1, 1),
+ (3, 1, 1),
+ (3, 1, 1),
+ ),
+ stage_conv_b_kernel_size: Tuple[Tuple[int]] = (
+ (1, 3, 3),
+ (1, 3, 3),
+ (1, 3, 3),
+ (1, 3, 3),
+ ),
+ stage_conv_b_num_groups: Tuple[int] = (1, 1, 1, 1),
+ stage_conv_b_dilation: Tuple[Tuple[int]] = (
+ (1, 1, 1),
+ (1, 1, 1),
+ (1, 1, 1),
+ (1, 1, 1),
+ ),
+ stage_spatial_stride: Tuple[int] = (1, 2, 2, 2),
+ stage_temporal_stride: Tuple[int] = (1, 1, 1, 1),
+ bottleneck: Callable = create_bottleneck_block,
+ # Head configs.
+ head_pool: Callable = nn.AvgPool3d,
+ head_pool_kernel_size: Tuple[int] = (4, 7, 7),
+ head_output_size: Tuple[int] = (1, 1, 1),
+ head_activation: Callable = None,
+ head_output_with_global_average: bool = True,
+) -> nn.Module:
+ """
+ Build ResNet style models for video recognition. ResNet has three parts:
+ Stem, Stages and Head. Stem is the first Convolution layer (Conv1) with an
+ optional pooling layer. Stages are grouped residual blocks. There are usually
+ multiple stages and each stage may include multiple residual blocks. Head
+ may include pooling, dropout, a fully-connected layer and global spatial
+ temporal averaging. The three parts are assembled in the following order:
+
+ ::
+
+ Input
+ ↓
+ Stem
+ ↓
+ Stage 1
+ ↓
+ .
+ .
+ .
+ ↓
+ Stage N
+ ↓
+ Head
+
+ Args:
+
+ input_channel (int): number of channels for the input video clip.
+
+ model_depth (int): the depth of the resnet. Options include: 50, 101, 152.
+ model_num_class (int): the number of classes for the video dataset.
+ dropout_rate (float): dropout rate.
+
+
+ norm (callable): a callable that constructs normalization layer.
+
+ activation (callable): a callable that constructs activation layer.
+
+ stem_dim_out (int): output channel size to stem.
+ stem_conv_kernel_size (tuple): convolutional kernel size(s) of stem.
+ stem_conv_stride (tuple): convolutional stride size(s) of stem.
+ stem_pool (callable): a callable that constructs resnet head pooling layer.
+ stem_pool_kernel_size (tuple): pooling kernel size(s).
+ stem_pool_stride (tuple): pooling stride size(s).
+
+ stage_conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
+ stage_conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
+ stage_conv_b_num_groups (tuple): number of groups for groupwise convolution
+ for conv_b. 1 for ResNet, and larger than 1 for ResNeXt.
+ stage_conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
+ stage_spatial_stride (tuple): the spatial stride for each stage.
+ stage_temporal_stride (tuple): the temporal stride for each stage.
+ bottleneck (callable): a callable that constructs bottleneck block layer.
+ Examples include: create_bottleneck_block.
+
+ head_pool (callable): a callable that constructs resnet head pooling layer.
+ head_pool_kernel_size (tuple): the pooling kernel size.
+ head_output_size (tuple): the size of output tensor for head.
+ head_activation (callable): a callable that constructs activation layer.
+ head_output_with_global_average (bool): if True, perform global averaging on
+ the head output.
+
+ Returns:
+ (nn.Module): basic resnet.
+ """
+ # Number of blocks for different stages given the model depth.
+ _MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3), 101: (3, 4, 23, 3), 152: (3, 8, 36, 3)}
+
+ # Given a model depth, get the number of blocks for each stage.
+ assert (
+ model_depth in _MODEL_STAGE_DEPTH.keys()
+ ), f"{model_depth} is not in {_MODEL_STAGE_DEPTH.keys()}"
+ stage_depths = _MODEL_STAGE_DEPTH[model_depth]
+
+ blocks = []
+ # Create stem for resnet.
+ stem = create_res_basic_stem(
+ in_channels=input_channel,
+ out_channels=stem_dim_out,
+ conv_kernel_size=stem_conv_kernel_size,
+ conv_stride=stem_conv_stride,
+ conv_padding=[size // 2 for size in stem_conv_kernel_size],
+ pool=stem_pool,
+ pool_kernel_size=stem_pool_kernel_size,
+ pool_stride=stem_pool_stride,
+ pool_padding=[size // 2 for size in stem_pool_kernel_size],
+ norm=norm,
+ activation=activation,
+ )
+ blocks.append(stem)
+
+ stage_dim_in = stem_dim_out
+ stage_dim_out = stage_dim_in * 4
+
+ # Create each stage for resnet.
+ for idx in range(len(stage_depths)):
+ stage_dim_inner = stage_dim_out // 4
+ depth = stage_depths[idx]
+
+ stage_conv_a_kernel = stage_conv_a_kernel_size[idx]
+ stage_conv_a_stride = (stage_temporal_stride[idx], 1, 1)
+ stage_conv_a_padding = (
+ [size // 2 for size in stage_conv_a_kernel]
+ if isinstance(stage_conv_a_kernel[0], int)
+ else [[size // 2 for size in sizes] for sizes in stage_conv_a_kernel]
+ )
+
+ stage_conv_b_stride = (1, stage_spatial_stride[idx], stage_spatial_stride[idx])
+
+ stage = create_res_stage(
+ depth=depth,
+ dim_in=stage_dim_in,
+ dim_inner=stage_dim_inner,
+ dim_out=stage_dim_out,
+ bottleneck=bottleneck,
+ conv_a_kernel_size=stage_conv_a_kernel,
+ conv_a_stride=stage_conv_a_stride,
+ conv_a_padding=stage_conv_a_padding,
+ conv_b_kernel_size=stage_conv_b_kernel_size[idx],
+ conv_b_stride=stage_conv_b_stride,
+ conv_b_padding=[size // 2 for size in stage_conv_b_kernel_size[idx]],
+ conv_b_num_groups=stage_conv_b_num_groups[idx],
+ conv_b_dilation=stage_conv_b_dilation[idx],
+ norm=norm,
+ activation=activation,
+ )
+
+ blocks.append(stage)
+ stage_dim_in = stage_dim_out
+ stage_dim_out = stage_dim_out * 2
+
+ if idx == 0 and stage1_pool is not None:
+ blocks.append(
+ stage1_pool(
+ kernel_size=stage1_pool_kernel_size,
+ stride=stage1_pool_kernel_size,
+ padding=(0, 0, 0),
+ )
+ )
+
+ head = create_res_basic_head(
+ in_features=stage_dim_in,
+ out_features=model_num_class,
+ pool=head_pool,
+ output_size=head_output_size,
+ pool_kernel_size=head_pool_kernel_size,
+ dropout_rate=dropout_rate,
+ activation=head_activation,
+ output_with_global_average=head_output_with_global_average,
+ )
+ blocks.append(head)
+ return Net(blocks=nn.ModuleList(blocks))
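+
+
+# A minimal usage sketch for create_resnet with its default configuration; the clip
+# size below is illustrative only:
+#
+#   model = create_resnet(model_depth=50, model_num_class=400)
+#   clip = torch.randn(1, 3, 8, 224, 224)      # (batch, channel, time, height, width)
+#   logits = model(clip)                       # shape (1, 400)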
+
+
+def create_acoustic_building_block(
+ *,
+ # Convolution configs.
+ dim_in: int,
+ dim_inner: int,
+ dim_out: int,
+ conv_a_kernel_size: Tuple[int] = None,
+ conv_a_stride: Tuple[int] = None,
+ conv_a_padding: Tuple[int] = None,
+ conv_a: Callable = None,
+ # Conv b f configs.
+ conv_b_kernel_size: Tuple[int] = (1, 1, 1),
+ conv_b_stride: Tuple[int] = (1, 1, 1),
+ conv_b_padding: Tuple[int] = (0, 0, 0),
+ conv_b_num_groups: int = 1,
+ conv_b_dilation: Tuple[int] = (1, 1, 1),
+ conv_b: Callable = nn.Conv3d,
+ conv_c: Callable = nn.Conv3d,
+ # Norm configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+) -> nn.Module:
+ """
+ Acoustic building block: a sequence of spatiotemporal Convolution, Normalization,
+ and Activations repeated in the following order:
+
+ ::
+
+
+ Conv3d (conv_a)
+ ↓
+ Normalization (norm_a)
+ ↓
+ Activation (act_a)
+ ↓
+ ---------------------------------
+ ↓ ↓
+ Temporal Conv3d (conv_b) Spatial Conv3d (conv_b)
+ ↓ ↓
+ Normalization (norm_b) Normalization (norm_b)
+ ↓ ↓
+ Activation (act_b) Activation (act_b)
+ ↓ ↓
+ ---------------------------------
+ ↓
+ Conv3d (conv_c)
+ ↓
+ Normalization (norm_c)
+
+ Normalization examples include: BatchNorm3d and None (no normalization).
+ Activation examples include: ReLU, Softmax, Sigmoid, and None (no activation).
+
+ Args:
+
+ dim_in (int): input channel size to the bottleneck block.
+ dim_inner (int): intermediate channel size of the bottleneck.
+ dim_out (int): output channel size of the bottleneck.
+ conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
+ conv_a_stride (tuple): convolutional stride size(s) for conv_a.
+ conv_a_padding (tuple): convolutional padding(s) for conv_a.
+ conv_a (callable): a callable that constructs the conv_a conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+ conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
+ conv_b_stride (tuple): convolutional stride size(s) for conv_b.
+ conv_b_padding (tuple): convolutional padding(s) for conv_b.
+ conv_b_num_groups (int): number of groups for groupwise convolution for
+ conv_b.
+ conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
+ conv_b (callable): a callable that constructs the conv_b conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+ conv_c (callable): a callable that constructs the conv_c conv layer, examples
+ include nn.Conv3d, OctaveConv, etc
+
+ norm (callable): a callable that constructs normalization layer, examples
+ include nn.BatchNorm3d, None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+
+ activation (callable): a callable that constructs activation layer, examples
+ include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
+ activation).
+
+ Returns:
+        (nn.Module): resnet acoustic building block.
+ """
+ # Conv b f configs.
+ conv_b_1_kernel_size = [conv_b_kernel_size[0], 1, 1]
+ conv_b_2_kernel_size = [1, conv_b_kernel_size[1], conv_b_kernel_size[2]]
+
+ conv_b_1_stride = [conv_b_stride[0], 1, 1]
+ conv_b_2_stride = [1, conv_b_stride[1], conv_b_stride[2]]
+
+ conv_b_1_padding = [conv_b_padding[0], 0, 0]
+ conv_b_2_padding = [0, conv_b_padding[1], conv_b_padding[2]]
+
+ conv_b_1_num_groups, conv_b_2_num_groups = (conv_b_num_groups,) * 2
+
+ conv_b_1_dilation = [conv_b_dilation[0], 1, 1]
+ conv_b_2_dilation = [1, conv_b_dilation[1], conv_b_dilation[2]]
+
+ conv_b_1 = conv_b(
+ in_channels=dim_in,
+ out_channels=dim_inner,
+ kernel_size=conv_b_1_kernel_size,
+ stride=conv_b_1_stride,
+ padding=conv_b_1_padding,
+ bias=False,
+ groups=conv_b_1_num_groups,
+ dilation=conv_b_1_dilation,
+ )
+ norm_b_1 = (
+ None
+ if norm is None
+ else norm(num_features=dim_inner, eps=norm_eps, momentum=norm_momentum)
+ )
+ act_b_1 = None if activation is None else activation()
+
+ conv_b_2 = conv_b(
+ in_channels=dim_in,
+ out_channels=dim_inner,
+ kernel_size=conv_b_2_kernel_size,
+ stride=conv_b_2_stride,
+ padding=conv_b_2_padding,
+ bias=False,
+ groups=conv_b_2_num_groups,
+ dilation=conv_b_2_dilation,
+ )
+ norm_b_2 = (
+ None
+ if norm is None
+ else norm(num_features=dim_inner, eps=norm_eps, momentum=norm_momentum)
+ )
+ act_b_2 = None if activation is None else activation()
+
+ conv_c = conv_c(
+ in_channels=dim_inner, out_channels=dim_out, kernel_size=(1, 1, 1), bias=False
+ )
+ norm_c = (
+ None
+ if norm is None
+ else norm(num_features=dim_out, eps=norm_eps, momentum=norm_momentum)
+ )
+ return SeparableBottleneckBlock(
+ conv_a=None,
+ norm_a=None,
+ act_a=None,
+ conv_b=nn.ModuleList([conv_b_1, conv_b_2]),
+ norm_b=nn.ModuleList([norm_b_1, norm_b_2]),
+ act_b=nn.ModuleList([act_b_1, act_b_2]),
+ conv_c=conv_c,
+ norm_c=norm_c,
+ )
+
+
+def create_acoustic_resnet(
+ *,
+ # Model configs.
+ input_channel: int = 2,
+ model_depth: int = 50,
+ model_num_class: int = 400,
+ dropout_rate: float = 0.5,
+ # Normalization configs.
+ norm: Callable = nn.BatchNorm3d,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+ # Stem configs.
+ stem_dim_out: int = 64,
+ stem_conv_kernel_size: Tuple[int] = (9, 9, 9),
+ stem_conv_stride: Tuple[int] = (1, 1, 1),
+ stem_conv_padding: Tuple[int] = (4, 4, 4),
+ stem_pool: Callable = nn.MaxPool3d,
+ stem_pool_kernel_size: Tuple[int] = (1, 3, 3),
+ stem_pool_stride: Tuple[int] = (1, 2, 2),
+ stem: Callable = create_acoustic_res_basic_stem,
+ # Stage configs.
+ stage_conv_a_kernel_size: Tuple[int] = (3, 1, 1),
+ stage_conv_a_padding: Tuple[int] = (1, 0, 0),
+ stage_conv_b_kernel_size: Tuple[int] = (1, 3, 3),
+ stage_conv_b_padding: Tuple[int] = (0, 1, 1),
+ stage_conv_b_num_groups: int = 1,
+ stage_conv_b_dilation: Tuple[int] = (1, 1, 1),
+ stage_spatial_stride: Tuple[int] = (1, 2, 2, 2),
+ stage_temporal_stride: Tuple[int] = (1, 2, 2, 2),
+ bottleneck: Tuple[Callable] = (
+ create_acoustic_bottleneck_block,
+ create_acoustic_bottleneck_block,
+ create_bottleneck_block,
+ create_bottleneck_block,
+ ),
+ # Head configs.
+ head_pool: Callable = nn.AvgPool3d,
+ head_output_size: Tuple[int] = (1, 1, 1),
+ head_activation: Callable = nn.Softmax,
+ head_pool_kernel_size: Tuple[int] = (1, 1, 1),
+) -> nn.Module:
+ """
+ Build ResNet style models for acoustic recognition. ResNet has three parts:
+ Stem, Stages and Head. The three parts are assembled in the following order:
+
+ ::
+
+ Input
+ ↓
+ Stem
+ ↓
+ Stage 1
+ ↓
+ .
+ .
+ .
+ ↓
+ Stage N
+ ↓
+ Head
+
+ Args:
+
+ input_channel (int): number of channels for the input video clip.
+
+ model_depth (int): the depth of the resnet.
+ model_num_class (int): the number of classes for the video dataset.
+ dropout_rate (float): dropout rate.
+
+ norm (callable): a callable that constructs normalization layer.
+
+ activation (callable): a callable that constructs activation layer.
+
+ stem_dim_out (int): output channel size to stem.
+ stem_conv_kernel_size (tuple): convolutional kernel size(s) of stem.
+ stem_conv_stride (tuple): convolutional stride size(s) of stem.
+ stem_pool (callable): a callable that constructs resnet head pooling layer.
+ stem_pool_kernel_size (tuple): pooling kernel size(s).
+ stem_pool_stride (tuple): pooling stride size(s).
+ stem (callable): a callable that constructs stem layer.
+ Examples include: create_res_video_stem.
+
+        stage_conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
+        stage_conv_a_padding (tuple): convolutional padding size(s) for conv_a.
+        stage_conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
+        stage_conv_b_padding (tuple): convolutional padding size(s) for conv_b.
+        stage_conv_b_num_groups (int): number of groups for groupwise convolution
+            for conv_b. 1 for ResNet, and larger than 1 for ResNeXt.
+        stage_conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
+        stage_spatial_stride (tuple): the spatial stride for each stage.
+        stage_temporal_stride (tuple): the temporal stride for each stage.
+        bottleneck (tuple): a tuple of callables, one per stage, that construct the
+            bottleneck block layer for that stage.
+            Examples include: create_bottleneck_block, create_acoustic_bottleneck_block.
+
+        head_pool (callable): a callable that constructs the resnet head pooling layer.
+        head_output_size (tuple): the size of output tensor for head.
+        head_activation (callable): a callable that constructs activation layer.
+        head_pool_kernel_size (tuple): pooling kernel size(s) for the head.
+
+ Returns:
+        (nn.Module): acoustic resnet that takes log-mel-spectrogram audio inputs of
+            shape B x 1 x 1 x T x F.
+ """
+ # Given a model depth, get the number of blocks for each stage.
+ _MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3), 101: (3, 4, 23, 3), 152: (3, 8, 36, 3)}
+ assert model_depth in _MODEL_STAGE_DEPTH.keys()
+ stage_depths = _MODEL_STAGE_DEPTH[model_depth]
+ assert len(bottleneck) == len(stage_depths)
+
+ blocks = []
+ # Create stem for resnet.
+ stem = stem(
+ in_channels=input_channel,
+ out_channels=stem_dim_out,
+ conv_kernel_size=stem_conv_kernel_size,
+ conv_stride=stem_conv_stride,
+ conv_padding=stem_conv_padding,
+ pool=stem_pool,
+ pool_kernel_size=stem_pool_kernel_size,
+ pool_stride=stem_pool_stride,
+ pool_padding=[size // 2 for size in stem_pool_kernel_size],
+ norm=norm,
+ activation=activation,
+ )
+ blocks.append(stem)
+ stage_dim_in = stem_dim_out
+ stage_dim_out = stage_dim_in * 4
+
+ # Create each stage for resnet.
+ for idx in range(len(stage_depths)):
+ stage_dim_inner = stage_dim_out // 4
+ depth = stage_depths[idx]
+
+ stage_conv_a_stride = (stage_temporal_stride[idx], 1, 1)
+ stage_conv_b_stride = (1, stage_spatial_stride[idx], stage_spatial_stride[idx])
+
+ stage = create_res_stage(
+ depth=depth,
+ dim_in=stage_dim_in,
+ dim_inner=stage_dim_inner,
+ dim_out=stage_dim_out,
+ bottleneck=bottleneck[idx],
+ conv_a_kernel_size=stage_conv_a_kernel_size,
+ conv_a_stride=stage_conv_a_stride,
+ conv_a_padding=stage_conv_a_padding,
+ conv_b_kernel_size=stage_conv_b_kernel_size,
+ conv_b_stride=stage_conv_b_stride,
+ conv_b_padding=stage_conv_b_padding,
+ conv_b_num_groups=stage_conv_b_num_groups,
+ conv_b_dilation=stage_conv_b_dilation,
+ norm=norm,
+ activation=activation,
+ )
+ blocks.append(stage)
+ stage_dim_in = stage_dim_out
+ stage_dim_out = stage_dim_out * 2
+
+ # Create head for resnet.
+ head = create_res_basic_head(
+ in_features=stage_dim_in,
+ out_features=model_num_class,
+ pool=head_pool,
+ output_size=head_output_size,
+ pool_kernel_size=head_pool_kernel_size,
+ dropout_rate=dropout_rate,
+ activation=head_activation,
+ )
+ blocks.append(head)
+ return Net(blocks=nn.ModuleList(blocks))
+
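+# A hedged usage sketch (illustrative only; `_demo_acoustic_resnet` and the input
+# sizes below are assumptions, not part of the library API). It builds the default
+# acoustic ResNet-50 for a single-channel log-mel-spectrogram input.
+def _demo_acoustic_resnet() -> torch.Tensor:
+    model = create_acoustic_resnet(input_channel=1, model_num_class=400)
+    spectrogram = torch.rand(2, 1, 1, 128, 80)  # B x C x 1 x T x F, as in the docstring
+    return model(spectrogram)  # expected logits of shape (2, 400)
+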
+
+class ResBlock(nn.Module):
+ """
+    Residual block. Performs a summation between an identity shortcut in branch1 and a
+    main block in branch2. When the input and output dimensions differ, branch1 applies
+    a convolution followed by a normalization before the summation.
+
+ ::
+
+
+ Input
+ |-------+
+ ↓ |
+ Block |
+ ↓ |
+ Summation ←-+
+ ↓
+ Activation
+
+ The builder can be found in `create_res_block`.
+ """
+
+ def __init__(
+ self,
+ branch1_conv: nn.Module = None,
+ branch1_norm: nn.Module = None,
+ branch2: nn.Module = None,
+ activation: nn.Module = None,
+ branch_fusion: Callable = None,
+    ) -> None:
+ """
+ Args:
+ branch1_conv (torch.nn.modules): convolutional module in branch1.
+ branch1_norm (torch.nn.modules): normalization module in branch1.
+ branch2 (torch.nn.modules): bottleneck block module in branch2.
+ activation (torch.nn.modules): activation module.
+            branch_fusion (Callable): a callable or layer that combines branch1
+                and branch2.
+ """
+ super().__init__()
+ set_attributes(self, locals())
+ assert self.branch2 is not None
+
+ def forward(self, x) -> torch.Tensor:
+ if self.branch1_conv is None:
+ x = self.branch_fusion(x, self.branch2(x))
+ else:
+ shortcut = self.branch1_conv(x)
+ if self.branch1_norm is not None:
+ shortcut = self.branch1_norm(shortcut)
+ x = self.branch_fusion(shortcut, self.branch2(x))
+ if self.activation is not None:
+ x = self.activation(x)
+ return x
+
+
+class SeparableBottleneckBlock(nn.Module):
+ """
+    Separable Bottleneck block: a sequence of spatiotemporal Convolution, Normalization,
+    and Activation layers applied in the order shown below. conv_b, norm_b, and act_b
+    each take an nn.ModuleList, so that several Convolution, Normalization, and
+    Activation branches run in parallel and are reduced before conv_c.
+
+ ::
+
+
+ Conv3d (conv_a)
+ ↓
+ Normalization (norm_a)
+ ↓
+ Activation (act_a)
+ ↓
+ Conv3d(s) (conv_b), ...
+ ↓ (↓)
+ Normalization(s) (norm_b), ...
+ ↓ (↓)
+ Activation(s) (act_b), ...
+ ↓ (↓)
+ Reduce (sum or cat)
+ ↓
+ Conv3d (conv_c)
+ ↓
+ Normalization (norm_c)
+ """
+
+ def __init__(
+ self,
+ *,
+ conv_a: nn.Module,
+ norm_a: nn.Module,
+ act_a: nn.Module,
+ conv_b: nn.ModuleList,
+ norm_b: nn.ModuleList,
+ act_b: nn.ModuleList,
+ conv_c: nn.Module,
+ norm_c: nn.Module,
+ reduce_method: str = "sum",
+ ) -> None:
+ """
+ Args:
+ conv_a (torch.nn.modules): convolutional module.
+ norm_a (torch.nn.modules): normalization module.
+ act_a (torch.nn.modules): activation module.
+ conv_b (torch.nn.modules_list): convolutional module(s).
+ norm_b (torch.nn.modules_list): normalization module(s).
+ act_b (torch.nn.modules_list): activation module(s).
+ conv_c (torch.nn.modules): convolutional module.
+ norm_c (torch.nn.modules): normalization module.
+            reduce_method (str): if multiple conv_b branches are used, reduce their
+                outputs with `sum` or `cat`.
+ """
+ super().__init__()
+ set_attributes(self, locals())
+        assert all(
+            op is not None for op in (self.conv_b, self.conv_c)
+        ), "Both conv_b and conv_c are required and must not be None."
+ assert reduce_method in ["sum", "cat"]
+ if self.norm_c is not None:
+ # This flag is used for weight initialization.
+ self.norm_c.block_final_bn = True
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ # Explicitly forward every layer.
+ # Branch2a, for example Tx1x1, BN, ReLU.
+ if self.conv_a is not None:
+ x = self.conv_a(x)
+ if self.norm_a is not None:
+ x = self.norm_a(x)
+ if self.act_a is not None:
+ x = self.act_a(x)
+
+ # Branch2b, for example 1xHxW, BN, ReLU.
+ output = []
+ for ind in range(len(self.conv_b)):
+ x_ = self.conv_b[ind](x)
+ if self.norm_b[ind] is not None:
+ x_ = self.norm_b[ind](x_)
+ if self.act_b[ind] is not None:
+ x_ = self.act_b[ind](x_)
+ output.append(x_)
+ if self.reduce_method == "sum":
+ x = torch.stack(output, dim=0).sum(dim=0, keepdim=False)
+ elif self.reduce_method == "cat":
+ x = torch.cat(output, dim=1)
+
+ # Branch2c, for example 1x1x1, BN.
+ x = self.conv_c(x)
+ if self.norm_c is not None:
+ x = self.norm_c(x)
+ return x
+
+
+class BottleneckBlock(nn.Module):
+ """
+ Bottleneck block: a sequence of spatiotemporal Convolution, Normalization,
+ and Activations repeated in the following order:
+
+ ::
+
+
+ Conv3d (conv_a)
+ ↓
+ Normalization (norm_a)
+ ↓
+ Activation (act_a)
+ ↓
+ Conv3d (conv_b)
+ ↓
+ Normalization (norm_b)
+ ↓
+ Activation (act_b)
+ ↓
+ Conv3d (conv_c)
+ ↓
+ Normalization (norm_c)
+
+ The builder can be found in `create_bottleneck_block`.
+ """
+
+ def __init__(
+ self,
+ *,
+ conv_a: nn.Module = None,
+ norm_a: nn.Module = None,
+ act_a: nn.Module = None,
+ conv_b: nn.Module = None,
+ norm_b: nn.Module = None,
+ act_b: nn.Module = None,
+ conv_c: nn.Module = None,
+ norm_c: nn.Module = None,
+ ) -> None:
+ """
+ Args:
+ conv_a (torch.nn.modules): convolutional module.
+ norm_a (torch.nn.modules): normalization module.
+ act_a (torch.nn.modules): activation module.
+ conv_b (torch.nn.modules): convolutional module.
+ norm_b (torch.nn.modules): normalization module.
+ act_b (torch.nn.modules): activation module.
+ conv_c (torch.nn.modules): convolutional module.
+ norm_c (torch.nn.modules): normalization module.
+ """
+ super().__init__()
+ set_attributes(self, locals())
+ assert all(op is not None for op in (self.conv_a, self.conv_b, self.conv_c))
+ if self.norm_c is not None:
+ # This flag is used for weight initialization.
+ self.norm_c.block_final_bn = True
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ # Explicitly forward every layer.
+ # Branch2a, for example Tx1x1, BN, ReLU.
+ x = self.conv_a(x)
+ if self.norm_a is not None:
+ x = self.norm_a(x)
+ if self.act_a is not None:
+ x = self.act_a(x)
+
+ # Branch2b, for example 1xHxW, BN, ReLU.
+ x = self.conv_b(x)
+ if self.norm_b is not None:
+ x = self.norm_b(x)
+ if self.act_b is not None:
+ x = self.act_b(x)
+
+ # Branch2c, for example 1x1x1, BN.
+ x = self.conv_c(x)
+ if self.norm_c is not None:
+ x = self.norm_c(x)
+ return x
+
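+# A hedged construction sketch (illustrative only; the channel sizes and clip shape
+# are assumptions, not library defaults): wiring a BottleneckBlock by hand from
+# plain torch layers.
+def _demo_bottleneck_block() -> torch.Tensor:
+    block = BottleneckBlock(
+        conv_a=nn.Conv3d(64, 16, kernel_size=(3, 1, 1), padding=(1, 0, 0), bias=False),
+        norm_a=nn.BatchNorm3d(16),
+        act_a=nn.ReLU(),
+        conv_b=nn.Conv3d(16, 16, kernel_size=(1, 3, 3), padding=(0, 1, 1), bias=False),
+        norm_b=nn.BatchNorm3d(16),
+        act_b=nn.ReLU(),
+        conv_c=nn.Conv3d(16, 256, kernel_size=(1, 1, 1), bias=False),
+        norm_c=nn.BatchNorm3d(256),
+    )
+    features = torch.rand(1, 64, 4, 28, 28)  # B x C x T x H x W
+    return block(features)  # expected shape (1, 256, 4, 28, 28)
+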
+
+class ResStage(nn.Module):
+ """
+ ResStage composes sequential blocks that make up a ResNet. These blocks could be,
+ for example, Residual blocks, Non-Local layers, or Squeeze-Excitation layers.
+
+ ::
+
+
+ Input
+ ↓
+ ResBlock
+ ↓
+ .
+ .
+ .
+ ↓
+ ResBlock
+
+ The builder can be found in `create_res_stage`.
+ """
+
+    def __init__(self, res_blocks: nn.ModuleList) -> None:
+ """
+ Args:
+ res_blocks (torch.nn.module_list): ResBlock module(s).
+ """
+ super().__init__()
+ self.res_blocks = res_blocks
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+        for res_block in self.res_blocks:
+            x = res_block(x)
+ return x
diff --git a/pytorchvideo/models/simclr.py b/pytorchvideo/models/simclr.py
new file mode 100644
index 00000000..305afca5
--- /dev/null
+++ b/pytorchvideo/models/simclr.py
@@ -0,0 +1,63 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+from fvcore.nn.distributed import differentiable_all_gather
+from pytorchvideo.layers.utils import set_attributes
+
+
+class SimCLR(nn.Module):
+ """
+    A Simple Framework for Contrastive Learning of Visual Representations (SimCLR).
+    Details can be found in:
+ https://arxiv.org/abs/2002.05709
+ """
+
+ def __init__(
+ self,
+ mlp: nn.Module,
+ backbone: Optional[nn.Module] = None,
+ temperature: float = 0.07,
+ ) -> None:
+ super().__init__()
+ set_attributes(self, locals())
+
+ def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+            x1 (torch.Tensor): a batch of augmented images or clips. The tensor shape
+                must be accepted by the backbone.
+            x2 (torch.Tensor): the same batch under a different augmentation. The
+                tensor shape must be accepted by the backbone.
+ """
+ if self.backbone is not None:
+ x1 = self.backbone(x1)
+ x1 = self.mlp(x1)
+ x1 = F.normalize(x1, p=2, dim=1)
+
+ if self.backbone is not None:
+ x2 = self.backbone(x2)
+ x2 = self.mlp(x2)
+ x2 = F.normalize(x2, p=2, dim=1)
+ x2 = torch.cat(differentiable_all_gather(x2), dim=0)
+
+ prod = torch.einsum("nc,kc->nk", [x1, x2])
+ prod = prod.div(self.temperature)
+ batch_size = x1.size(0)
+ if dist.is_available() and dist.is_initialized():
+ device_ind = dist.get_rank()
+ else:
+ device_ind = 0
+ gt = (
+ torch.tensor(
+ list(range(device_ind * batch_size, (device_ind + 1) * batch_size))
+ )
+ .long()
+ .to(x1.device)
+ )
+ loss = torch.nn.functional.cross_entropy(prod, gt)
+ return loss
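+
+
+# A hedged usage sketch (illustrative only; `_demo_simclr_loss` and the feature
+# dimensions are assumptions). With backbone=None the module treats its inputs as
+# pre-extracted features and only applies the projection MLP before the loss; in a
+# single-process run the all-gather is assumed to reduce to the local batch.
+def _demo_simclr_loss() -> torch.Tensor:
+    projection_mlp = nn.Sequential(nn.Linear(256, 128), nn.ReLU(), nn.Linear(128, 64))
+    simclr = SimCLR(mlp=projection_mlp, backbone=None, temperature=0.07)
+    view1 = torch.rand(4, 256)  # two augmented views of the same 4 samples
+    view2 = torch.rand(4, 256)
+    return simclr(view1, view2)  # scalar contrastive loss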
diff --git a/pytorchvideo/models/slowfast.py b/pytorchvideo/models/slowfast.py
new file mode 100644
index 00000000..5430a595
--- /dev/null
+++ b/pytorchvideo/models/slowfast.py
@@ -0,0 +1,487 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from pytorchvideo.layers.utils import set_attributes
+from pytorchvideo.models.head import create_res_basic_head
+from pytorchvideo.models.net import MultiPathWayWithFuse, Net
+from pytorchvideo.models.resnet import create_bottleneck_block, create_res_stage
+from pytorchvideo.models.stem import create_res_basic_stem
+
+
+def create_slowfast(
+ *,
+ # SlowFast configs.
+ slowfast_channel_reduction_ratio: Union[Tuple[int], int] = (8,),
+ slowfast_conv_channel_fusion_ratio: int = 2,
+ slowfast_fusion_conv_kernel_size: Tuple[int] = (
+ 7,
+ 1,
+ 1,
+ ), # deprecated, use fusion_builder
+ slowfast_fusion_conv_stride: Tuple[int] = (
+ 4,
+ 1,
+ 1,
+ ), # deprecated, use fusion_builder
+ fusion_builder: Callable[
+ [int, int], nn.Module
+ ] = None, # Args: fusion_dim_in, stage_idx
+ # Input clip configs.
+ input_channels: Tuple[int] = (3, 3),
+ # Model configs.
+ model_depth: int = 50,
+ model_num_class: int = 400,
+ dropout_rate: float = 0.5,
+ # Normalization configs.
+ norm: Callable = nn.BatchNorm3d,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+ # Stem configs.
+ stem_function: Tuple[Callable] = (
+ create_res_basic_stem,
+ create_res_basic_stem,
+ ),
+ stem_dim_outs: Tuple[int] = (64, 8),
+ stem_conv_kernel_sizes: Tuple[Tuple[int]] = ((1, 7, 7), (5, 7, 7)),
+ stem_conv_strides: Tuple[Tuple[int]] = ((1, 2, 2), (1, 2, 2)),
+ stem_pool: Union[Callable, Tuple[Callable]] = (nn.MaxPool3d, nn.MaxPool3d),
+ stem_pool_kernel_sizes: Tuple[Tuple[int]] = ((1, 3, 3), (1, 3, 3)),
+ stem_pool_strides: Tuple[Tuple[int]] = ((1, 2, 2), (1, 2, 2)),
+ # Stage configs.
+ stage_conv_a_kernel_sizes: Tuple[Tuple[Tuple[int]]] = (
+ ((1, 1, 1), (1, 1, 1), (3, 1, 1), (3, 1, 1)),
+ ((3, 1, 1), (3, 1, 1), (3, 1, 1), (3, 1, 1)),
+ ),
+ stage_conv_b_kernel_sizes: Tuple[Tuple[Tuple[int]]] = (
+ ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
+ ((1, 3, 3), (1, 3, 3), (1, 3, 3), (1, 3, 3)),
+ ),
+ stage_conv_b_num_groups: Tuple[Tuple[int]] = ((1, 1, 1, 1), (1, 1, 1, 1)),
+ stage_conv_b_dilations: Tuple[Tuple[Tuple[int]]] = (
+ ((1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1)),
+ ((1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1)),
+ ),
+ stage_spatial_strides: Tuple[Tuple[int]] = ((1, 2, 2, 2), (1, 2, 2, 2)),
+ stage_temporal_strides: Tuple[Tuple[int]] = ((1, 1, 1, 1), (1, 1, 1, 1)),
+ bottleneck: Union[Callable, Tuple[Tuple[Callable]]] = (
+ (
+ create_bottleneck_block,
+ create_bottleneck_block,
+ create_bottleneck_block,
+ create_bottleneck_block,
+ ),
+ (
+ create_bottleneck_block,
+ create_bottleneck_block,
+ create_bottleneck_block,
+ create_bottleneck_block,
+ ),
+ ),
+ # Head configs.
+ head_pool: Callable = nn.AvgPool3d,
+ head_pool_kernel_sizes: Tuple[Tuple[int]] = ((8, 7, 7), (32, 7, 7)),
+ head_output_size: Tuple[int] = (1, 1, 1),
+ head_activation: Callable = None,
+ head_output_with_global_average: bool = True,
+) -> nn.Module:
+ """
+    Build a SlowFast model for video recognition. The SlowFast model couples a Slow
+    pathway, operating at a low frame rate, to capture spatial semantics, with a Fast
+    pathway, operating at a high frame rate, to capture motion at fine temporal
+    resolution. The Fast pathway can be made very lightweight by reducing its channel
+    capacity, yet it still learns useful temporal information for video recognition.
+    Details can be found in the paper:
+
+ Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.
+ "SlowFast networks for video recognition."
+ https://arxiv.org/pdf/1812.03982.pdf
+
+ ::
+
+ Slow Input Fast Input
+ ↓ ↓
+ Stem Stem
+ ↓ ⭠ Fusion- ↓
+ Stage 1 Stage 1
+ ↓ ⭠ Fusion- ↓
+ . .
+ ↓ ↓
+ Stage N Stage N
+ ↓ ⭠ Fusion- ↓
+ ↓
+ Head
+
+ Args:
+        slowfast_channel_reduction_ratio (int): Corresponds to the inverse of the channel
+            reduction ratio, $\beta$, between the Slow and Fast pathways.
+        slowfast_conv_channel_fusion_ratio (int): Ratio of channel dimensions
+            between the Slow and Fast pathways.
+        slowfast_fusion_conv_kernel_size (tuple): DEPRECATED, use fusion_builder. The
+            convolutional kernel size used for fusion.
+        slowfast_fusion_conv_stride (tuple): DEPRECATED, use fusion_builder. The
+            convolutional stride size used for fusion.
+        fusion_builder (Callable[[int, int], nn.Module]): Builder function that creates
+            the fusion module for a given stage, from its input dimension and stage index.
+
+ input_channels (tuple): number of channels for the input video clip.
+
+ model_depth (int): the depth of the resnet.
+ model_num_class (int): the number of classes for the video dataset.
+ dropout_rate (float): dropout rate.
+
+ norm (callable): a callable that constructs normalization layer.
+
+ activation (callable): a callable that constructs activation layer.
+
+        stem_function (Tuple[Callable]): a callable per pathway that constructs the
+            stem layer. Examples include create_res_basic_stem.
+        stem_dim_outs (tuple): output channel size of the stem, per pathway.
+        stem_conv_kernel_sizes (tuple): convolutional kernel size(s) of the stem.
+        stem_conv_strides (tuple): convolutional stride size(s) of the stem.
+        stem_pool (Tuple[Callable]): a callable per pathway that constructs the stem
+            pooling layer.
+        stem_pool_kernel_sizes (tuple): pooling kernel size(s).
+        stem_pool_strides (tuple): pooling stride size(s).
+
+ stage_conv_a_kernel_sizes (tuple): convolutional kernel size(s) for conv_a.
+ stage_conv_b_kernel_sizes (tuple): convolutional kernel size(s) for conv_b.
+ stage_conv_b_num_groups (tuple): number of groups for groupwise convolution
+ for conv_b. 1 for ResNet, and larger than 1 for ResNeXt.
+ stage_conv_b_dilations (tuple): dilation for 3D convolution for conv_b.
+ stage_spatial_strides (tuple): the spatial stride for each stage.
+ stage_temporal_strides (tuple): the temporal stride for each stage.
+        bottleneck (Tuple[Tuple[Callable]]): callables that construct the bottleneck
+            block layer, indexed by pathway and stage. Examples include:
+            create_bottleneck_block.
+
+        head_pool (callable): a callable that constructs the resnet head pooling layer.
+        head_pool_kernel_sizes (tuple): pooling kernel size(s) for the head, per pathway.
+        head_output_size (tuple): the size of output tensor for head.
+        head_activation (callable): a callable that constructs activation layer.
+        head_output_with_global_average (bool): if True, perform global averaging on
+            the head output.
+ Returns:
+ (nn.Module): SlowFast model.
+ """
+
+ # Number of blocks for different stages given the model depth.
+ _num_pathway = len(input_channels)
+ _MODEL_STAGE_DEPTH = {
+ 18: (1, 1, 1, 1),
+ 50: (3, 4, 6, 3),
+ 101: (3, 4, 23, 3),
+ 152: (3, 8, 36, 3),
+ }
+ assert (
+ model_depth in _MODEL_STAGE_DEPTH.keys()
+ ), f"{model_depth} is not in {_MODEL_STAGE_DEPTH.keys()}"
+ stage_depths = _MODEL_STAGE_DEPTH[model_depth]
+
+ # Fix up inputs
+ if isinstance(slowfast_channel_reduction_ratio, int):
+ slowfast_channel_reduction_ratio = (slowfast_channel_reduction_ratio,)
+ if isinstance(stem_pool, Callable):
+ stem_pool = (stem_pool,) * _num_pathway
+ if isinstance(bottleneck, Callable):
+ bottleneck = (bottleneck,) * len(stage_depths)
+ bottleneck = (bottleneck,) * _num_pathway
+ if fusion_builder is None:
+ fusion_builder = FastToSlowFusionBuilder(
+ slowfast_channel_reduction_ratio=slowfast_channel_reduction_ratio[0],
+ conv_fusion_channel_ratio=slowfast_conv_channel_fusion_ratio,
+ conv_kernel_size=slowfast_fusion_conv_kernel_size,
+ conv_stride=slowfast_fusion_conv_stride,
+ norm=norm,
+ activation=activation,
+ max_stage_idx=len(stage_depths) - 1,
+ ).create_module
+
+ # Build stem blocks.
+ stems = []
+ for pathway_idx in range(_num_pathway):
+ stems.append(
+ stem_function[pathway_idx](
+ in_channels=input_channels[pathway_idx],
+ out_channels=stem_dim_outs[pathway_idx],
+ conv_kernel_size=stem_conv_kernel_sizes[pathway_idx],
+ conv_stride=stem_conv_strides[pathway_idx],
+ conv_padding=[
+ size // 2 for size in stem_conv_kernel_sizes[pathway_idx]
+ ],
+ pool=stem_pool[pathway_idx],
+ pool_kernel_size=stem_pool_kernel_sizes[pathway_idx],
+ pool_stride=stem_pool_strides[pathway_idx],
+ pool_padding=[
+ size // 2 for size in stem_pool_kernel_sizes[pathway_idx]
+ ],
+ norm=norm,
+ activation=activation,
+ )
+ )
+
+ stages = []
+ stages.append(
+ MultiPathWayWithFuse(
+ multipathway_blocks=nn.ModuleList(stems),
+ multipathway_fusion=fusion_builder(
+ fusion_dim_in=stem_dim_outs[0],
+ stage_idx=0,
+ ),
+ )
+ )
+
+ # Build stages blocks.
+ stage_dim_in = stem_dim_outs[0]
+ stage_dim_out = stage_dim_in * 4
+ for idx in range(len(stage_depths)):
+ pathway_stage_dim_in = [
+ stage_dim_in
+ + stage_dim_in
+ * slowfast_conv_channel_fusion_ratio
+ // slowfast_channel_reduction_ratio[0],
+ ]
+ pathway_stage_dim_inner = [
+ stage_dim_out // 4,
+ ]
+ pathway_stage_dim_out = [
+ stage_dim_out,
+ ]
+ for reduction_ratio in slowfast_channel_reduction_ratio:
+ pathway_stage_dim_in = pathway_stage_dim_in + [
+ stage_dim_in // reduction_ratio
+ ]
+ pathway_stage_dim_inner = pathway_stage_dim_inner + [
+ stage_dim_out // 4 // reduction_ratio
+ ]
+ pathway_stage_dim_out = pathway_stage_dim_out + [
+ stage_dim_out // reduction_ratio
+ ]
+
+ stage = []
+ for pathway_idx in range(_num_pathway):
+ depth = stage_depths[idx]
+
+ stage_conv_a_stride = (stage_temporal_strides[pathway_idx][idx], 1, 1)
+ stage_conv_b_stride = (
+ 1,
+ stage_spatial_strides[pathway_idx][idx],
+ stage_spatial_strides[pathway_idx][idx],
+ )
+ stage.append(
+ create_res_stage(
+ depth=depth,
+ dim_in=pathway_stage_dim_in[pathway_idx],
+ dim_inner=pathway_stage_dim_inner[pathway_idx],
+ dim_out=pathway_stage_dim_out[pathway_idx],
+ bottleneck=bottleneck[pathway_idx][idx],
+ conv_a_kernel_size=stage_conv_a_kernel_sizes[pathway_idx][idx],
+ conv_a_stride=stage_conv_a_stride,
+ conv_a_padding=[
+ size // 2
+ for size in stage_conv_a_kernel_sizes[pathway_idx][idx]
+ ],
+ conv_b_kernel_size=stage_conv_b_kernel_sizes[pathway_idx][idx],
+ conv_b_stride=stage_conv_b_stride,
+ conv_b_padding=[
+ size // 2
+ for size in stage_conv_b_kernel_sizes[pathway_idx][idx]
+ ],
+ conv_b_num_groups=stage_conv_b_num_groups[pathway_idx][idx],
+ conv_b_dilation=stage_conv_b_dilations[pathway_idx][idx],
+ norm=norm,
+ activation=activation,
+ )
+ )
+ stages.append(
+ MultiPathWayWithFuse(
+ multipathway_blocks=nn.ModuleList(stage),
+ multipathway_fusion=fusion_builder(
+ fusion_dim_in=stage_dim_out,
+ stage_idx=idx + 1,
+ ),
+ )
+ )
+ stage_dim_in = stage_dim_out
+ stage_dim_out = stage_dim_out * 2
+
+ if head_pool is None:
+ pool_model = None
+ elif head_pool == nn.AdaptiveAvgPool3d:
+ pool_model = [head_pool(head_output_size[idx]) for idx in range(_num_pathway)]
+ elif head_pool == nn.AvgPool3d:
+ pool_model = [
+ head_pool(
+ kernel_size=head_pool_kernel_sizes[idx],
+ stride=(1, 1, 1),
+ padding=(0, 0, 0),
+ )
+ for idx in range(_num_pathway)
+ ]
+ else:
+        raise NotImplementedError(f"Unsupported head_pool type {head_pool}")
+
+ stages.append(PoolConcatPathway(retain_list=False, pool=nn.ModuleList(pool_model)))
+ head_in_features = stage_dim_in
+ for reduction_ratio in slowfast_channel_reduction_ratio:
+ head_in_features = head_in_features + stage_dim_in // reduction_ratio
+ stages.append(
+ create_res_basic_head(
+ in_features=head_in_features,
+ out_features=model_num_class,
+ pool=None,
+ output_size=head_output_size,
+ dropout_rate=dropout_rate,
+ activation=head_activation,
+ output_with_global_average=head_output_with_global_average,
+ )
+ )
+ return Net(blocks=nn.ModuleList(stages))
+
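+# A hedged usage sketch (illustrative only; `_demo_slowfast_forward` and the clip
+# sizes are assumptions chosen to match the default head pooling kernels).
+def _demo_slowfast_forward() -> torch.Tensor:
+    model = create_slowfast(model_num_class=400)
+    slow_clip = torch.rand(1, 3, 8, 224, 224)  # Slow pathway: 8 frames
+    fast_clip = torch.rand(1, 3, 32, 224, 224)  # Fast pathway: 32 frames (alpha = 4)
+    return model([slow_clip, fast_clip])  # expected logits of shape (1, 400)
+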
+
+# TODO: move to pytorchvideo/layer once we have a common.py
+class PoolConcatPathway(nn.Module):
+ """
+ Given a list of tensors, perform optional spatio-temporal pool and concatenate the
+ tensors along the channel dimension.
+ """
+
+ def __init__(
+ self,
+ retain_list: bool = False,
+ pool: Optional[nn.ModuleList] = None,
+ dim: int = 1,
+ ) -> None:
+ """
+ Args:
+            retain_list (bool): if True, return the concatenated tensor wrapped in a list.
+            pool (nn.ModuleList): if not None, a list of pooling modules, one per
+                pathway, applied before concatenation.
+            dim (int): dimension along which to perform concatenation.
+ """
+ super().__init__()
+ set_attributes(self, locals())
+
+ def forward(self, x: List[torch.Tensor]) -> torch.Tensor:
+ if self.pool is not None:
+ assert len(x) == len(self.pool)
+ output = []
+ for ind in range(len(x)):
+ if x[ind] is not None:
+ if self.pool is not None and self.pool[ind] is not None:
+ x[ind] = self.pool[ind](x[ind])
+ output.append(x[ind])
+        if self.retain_list:
+            return [torch.cat(output, self.dim)]
+        else:
+            return torch.cat(output, self.dim)
+
+
+class FastToSlowFusionBuilder:
+ def __init__(
+ self,
+ slowfast_channel_reduction_ratio: int,
+ conv_fusion_channel_ratio: float,
+ conv_kernel_size: Tuple[int],
+ conv_stride: Tuple[int],
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ activation: Callable = nn.ReLU,
+ max_stage_idx: int = 3,
+ ) -> None:
+ """
+        Given a list of two tensors from the Slow and Fast pathways, the built module
+        fuses information from the Fast pathway into the Slow pathway through a
+        convolution followed by a concatenation, then returns the fused list of tensors
+        from the Slow and Fast pathways in order.
+ Args:
+ slowfast_channel_reduction_ratio (int): Reduction ratio from the stage dimension.
+ Used to compute conv_dim_in = fusion_dim_in // slowfast_channel_reduction_ratio
+ conv_fusion_channel_ratio (int): channel ratio for the convolution used to fuse
+ from Fast pathway to Slow pathway.
+            conv_kernel_size (tuple): kernel size of the convolution used to fuse from
+                the Fast pathway to the Slow pathway.
+            conv_stride (tuple): stride size of the convolution used to fuse from the
+                Fast pathway to the Slow pathway.
+ norm (callable): a callable that constructs normalization layer, examples
+ include nn.BatchNorm3d, None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+ activation (callable): a callable that constructs activation layer, examples
+ include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
+ activation).
+            max_stage_idx (int): an identity module is returned for stages beyond this index.
+ """
+ set_attributes(self, locals())
+
+ def create_module(self, fusion_dim_in: int, stage_idx: int) -> nn.Module:
+ """
+ Creates the module for the given stage
+ Args:
+ fusion_dim_in (int): input stage dimension
+ stage_idx (int): which stage this is
+ """
+ if stage_idx > self.max_stage_idx:
+ return nn.Identity()
+
+ conv_dim_in = fusion_dim_in // self.slowfast_channel_reduction_ratio
+ conv_fast_to_slow = nn.Conv3d(
+ conv_dim_in,
+ int(conv_dim_in * self.conv_fusion_channel_ratio),
+ kernel_size=self.conv_kernel_size,
+ stride=self.conv_stride,
+ padding=[k_size // 2 for k_size in self.conv_kernel_size],
+ bias=False,
+ )
+ norm_module = (
+ None
+ if self.norm is None
+ else self.norm(
+ num_features=conv_dim_in * self.conv_fusion_channel_ratio,
+ eps=self.norm_eps,
+ momentum=self.norm_momentum,
+ )
+ )
+ activation_module = None if self.activation is None else self.activation()
+ return FuseFastToSlow(
+ conv_fast_to_slow=conv_fast_to_slow,
+ norm=norm_module,
+ activation=activation_module,
+ )
+
+
+class FuseFastToSlow(nn.Module):
+ """
+    Given a list of two tensors from the Slow and Fast pathways, fuses information from
+    the Fast pathway into the Slow pathway through a convolution followed by a
+    concatenation, then returns the fused list of tensors from the Slow and Fast
+    pathways in order.
+ """
+
+ def __init__(
+ self,
+ conv_fast_to_slow: nn.Module,
+ norm: Optional[nn.Module] = None,
+ activation: Optional[nn.Module] = None,
+ ) -> None:
+ """
+ Args:
+ conv_fast_to_slow (nn.module): convolution to perform fusion.
+ norm (nn.module): normalization module.
+ activation (torch.nn.modules): activation module.
+ """
+ super().__init__()
+ set_attributes(self, locals())
+
+ def forward(self, x):
+ x_s = x[0]
+ x_f = x[1]
+ fuse = self.conv_fast_to_slow(x_f)
+ if self.norm is not None:
+ fuse = self.norm(fuse)
+ if self.activation is not None:
+ fuse = self.activation(fuse)
+ x_s_fuse = torch.cat([x_s, fuse], 1)
+ return [x_s_fuse, x_f]
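+
+
+# A hedged shape sketch (illustrative only; `_demo_fast_to_slow_fusion` and the
+# tensor sizes are assumptions). The Fast features are temporally strided to match
+# the Slow pathway and concatenated onto its channel dimension.
+def _demo_fast_to_slow_fusion() -> torch.Tensor:
+    fusion = FastToSlowFusionBuilder(
+        slowfast_channel_reduction_ratio=8,
+        conv_fusion_channel_ratio=2,
+        conv_kernel_size=(7, 1, 1),
+        conv_stride=(4, 1, 1),
+    ).create_module(fusion_dim_in=64, stage_idx=0)
+    slow = torch.rand(1, 64, 8, 56, 56)
+    fast = torch.rand(1, 8, 32, 56, 56)  # 64 // 8 channels, 4x the frames
+    fused_slow, _ = fusion([slow, fast])
+    return fused_slow  # expected shape (1, 80, 8, 56, 56): 64 + 8 * 2 channels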
diff --git a/pytorchvideo/models/stem.py b/pytorchvideo/models/stem.py
new file mode 100644
index 00000000..1f110215
--- /dev/null
+++ b/pytorchvideo/models/stem.py
@@ -0,0 +1,260 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Callable, Tuple
+
+import torch
+import torch.nn as nn
+from pytorchvideo.layers.convolutions import ConvReduce3D
+from pytorchvideo.layers.utils import set_attributes
+
+
+def create_res_basic_stem(
+ *,
+ # Conv configs.
+ in_channels: int,
+ out_channels: int,
+ conv_kernel_size: Tuple[int] = (3, 7, 7),
+ conv_stride: Tuple[int] = (1, 2, 2),
+ conv_padding: Tuple[int] = (1, 3, 3),
+ conv_bias: bool = False,
+ conv: Callable = nn.Conv3d,
+ # Pool configs.
+ pool: Callable = nn.MaxPool3d,
+ pool_kernel_size: Tuple[int] = (1, 3, 3),
+ pool_stride: Tuple[int] = (1, 2, 2),
+ pool_padding: Tuple[int] = (0, 1, 1),
+ # BN configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+) -> nn.Module:
+ """
+    Creates the basic resnet stem layer. It performs spatiotemporal Convolution, BN, and
+    ReLU followed by a spatiotemporal pooling.
+
+ ::
+
+ Conv3d
+ ↓
+ Normalization
+ ↓
+ Activation
+ ↓
+ Pool3d
+
+ Normalization options include: BatchNorm3d and None (no normalization).
+ Activation options include: ReLU, Softmax, Sigmoid, and None (no activation).
+ Pool3d options include: AvgPool3d, MaxPool3d, and None (no pooling).
+
+ Args:
+
+ in_channels (int): input channel size of the convolution.
+ out_channels (int): output channel size of the convolution.
+ conv_kernel_size (tuple): convolutional kernel size(s).
+ conv_stride (tuple): convolutional stride size(s).
+ conv_padding (tuple): convolutional padding size(s).
+ conv_bias (bool): convolutional bias. If true, adds a learnable bias to the
+ output.
+ conv (callable): Callable used to build the convolution layer.
+
+ pool (callable): a callable that constructs pooling layer, options include:
+ nn.AvgPool3d, nn.MaxPool3d, and None (not performing pooling).
+ pool_kernel_size (tuple): pooling kernel size(s).
+ pool_stride (tuple): pooling stride size(s).
+ pool_padding (tuple): pooling padding size(s).
+
+ norm (callable): a callable that constructs normalization layer, options
+ include nn.BatchNorm3d, None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+
+ activation (callable): a callable that constructs activation layer, options
+ include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
+ activation).
+
+ Returns:
+ (nn.Module): resnet basic stem layer.
+ """
+ conv_module = conv(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=conv_kernel_size,
+ stride=conv_stride,
+ padding=conv_padding,
+ bias=conv_bias,
+ )
+ norm_module = (
+ None
+ if norm is None
+ else norm(num_features=out_channels, eps=norm_eps, momentum=norm_momentum)
+ )
+ activation_module = None if activation is None else activation()
+ pool_module = (
+ None
+ if pool is None
+ else pool(
+ kernel_size=pool_kernel_size, stride=pool_stride, padding=pool_padding
+ )
+ )
+
+ return ResNetBasicStem(
+ conv=conv_module,
+ norm=norm_module,
+ activation=activation_module,
+ pool=pool_module,
+ )
+
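+# A hedged usage sketch (illustrative only; `_demo_res_basic_stem` and the clip size
+# are assumptions). The default kernel and stride settings downsample space by 4x.
+def _demo_res_basic_stem() -> torch.Tensor:
+    stem = create_res_basic_stem(in_channels=3, out_channels=64)
+    clip = torch.rand(1, 3, 8, 112, 112)  # B x C x T x H x W
+    return stem(clip)  # expected shape (1, 64, 8, 28, 28)
+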
+
+def create_acoustic_res_basic_stem(
+ *,
+ # Conv configs.
+ in_channels: int,
+ out_channels: int,
+ conv_kernel_size: Tuple[int] = (3, 7, 7),
+ conv_stride: Tuple[int] = (1, 1, 1),
+ conv_padding: Tuple[int] = (1, 3, 3),
+ conv_bias: bool = False,
+ # Pool configs.
+ pool: Callable = nn.MaxPool3d,
+ pool_kernel_size: Tuple[int] = (1, 3, 3),
+ pool_stride: Tuple[int] = (1, 2, 2),
+ pool_padding: Tuple[int] = (0, 1, 1),
+ # BN configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+) -> nn.Module:
+ """
+    Creates the acoustic resnet stem layer. It performs a spatial and a temporal
+    Convolution in parallel, sums their outputs, then applies BN and ReLU followed by a
+    spatiotemporal pooling.
+
+ ::
+
+                                 Conv3d    Conv3d
+                                    ↓         ↓
+                                     ↘       ↙
+                                        Sum
+                                         ↓
+                                   Normalization
+                                         ↓
+                                     Activation
+                                         ↓
+                                       Pool3d
+
+ Normalization options include: BatchNorm3d and None (no normalization).
+ Activation options include: ReLU, Softmax, Sigmoid, and None (no activation).
+ Pool3d options include: AvgPool3d, MaxPool3d, and None (no pooling).
+
+ Args:
+ in_channels (int): input channel size of the convolution.
+ out_channels (int): output channel size of the convolution.
+ conv_kernel_size (tuple): convolutional kernel size(s).
+        conv_stride (tuple): convolutional stride size(s), applied to both the parallel
+            temporal and spatial convolutions.
+        conv_padding (tuple): convolutional padding size(s), split across the parallel
+            temporal and spatial convolutions.
+ conv_bias (bool): convolutional bias. If true, adds a learnable bias to the
+ output.
+
+ pool (callable): a callable that constructs pooling layer, options include:
+ nn.AvgPool3d, nn.MaxPool3d, and None (not performing pooling).
+ pool_kernel_size (tuple): pooling kernel size(s).
+ pool_stride (tuple): pooling stride size(s).
+ pool_padding (tuple): pooling padding size(s).
+
+ norm (callable): a callable that constructs normalization layer, options
+ include nn.BatchNorm3d, None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+
+ activation (callable): a callable that constructs activation layer, options
+ include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
+ activation).
+
+ Returns:
+ (nn.Module): resnet basic stem layer.
+ """
+ conv_module = ConvReduce3D(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=(
+ # Temporal conv kernel size.
+ (conv_kernel_size[0], 1, 1),
+ # Spatial conv kernel size.
+ (1, conv_kernel_size[1], conv_kernel_size[2]),
+ ),
+ stride=(conv_stride, conv_stride),
+ padding=((conv_padding[0], 0, 0), (0, conv_padding[1], conv_padding[2])),
+ bias=(conv_bias, conv_bias),
+ reduction_method="sum",
+ )
+ norm_module = (
+ None
+ if norm is None
+ else norm(num_features=out_channels, eps=norm_eps, momentum=norm_momentum)
+ )
+ activation_module = None if activation is None else activation()
+ pool_module = (
+ None
+ if pool is None
+ else pool(
+ kernel_size=pool_kernel_size, stride=pool_stride, padding=pool_padding
+ )
+ )
+
+ return ResNetBasicStem(
+ conv=conv_module,
+ norm=norm_module,
+ activation=activation_module,
+ pool=pool_module,
+ )
+
+
+class ResNetBasicStem(nn.Module):
+ """
+    ResNet basic 3D stem module. Performs spatiotemporal Convolution, BN, and activation
+    followed by a spatiotemporal pooling.
+
+ ::
+
+ Conv3d
+ ↓
+ Normalization
+ ↓
+ Activation
+ ↓
+ Pool3d
+
+ The builder can be found in `create_res_basic_stem`.
+ """
+
+ def __init__(
+ self,
+ *,
+ conv: nn.Module = None,
+ norm: nn.Module = None,
+ activation: nn.Module = None,
+ pool: nn.Module = None,
+ ) -> None:
+ """
+ Args:
+ conv (torch.nn.modules): convolutional module.
+ norm (torch.nn.modules): normalization module.
+ activation (torch.nn.modules): activation module.
+ pool (torch.nn.modules): pooling module.
+ """
+ super().__init__()
+ set_attributes(self, locals())
+ assert self.conv is not None
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.conv(x)
+ if self.norm is not None:
+ x = self.norm(x)
+ if self.activation is not None:
+ x = self.activation(x)
+ if self.pool is not None:
+ x = self.pool(x)
+ return x
diff --git a/pytorchvideo/models/weight_init.py b/pytorchvideo/models/weight_init.py
new file mode 100644
index 00000000..258e7bfc
--- /dev/null
+++ b/pytorchvideo/models/weight_init.py
@@ -0,0 +1,42 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import torch.nn as nn
+from fvcore.nn.weight_init import c2_msra_fill
+
+
+def init_net_weights(model: nn.Module, fc_init_std: float = 0.01) -> nn.Module:
+ """
+ Performs ResNet style weight initialization. That is, recursively initialize the
+ given model in the following way for each type:
+ Conv - Follow the initialization of kaiming_normal:
+ https://pytorch.org/docs/stable/_modules/torch/nn/init.html#kaiming_normal_
+ BatchNorm - Set weight and bias of last BatchNorm at every residual bottleneck
+ to 0.
+ Linear - Set weight to 0 mean Gaussian with std deviation fc_init_std and bias
+ to 0.
+    Args:
+        model (nn.Module): the model to initialize in place (also returned for chaining).
+        fc_init_std (float): the expected standard deviation for fully-connected layer.
+ """
+ for m in model.modules():
+ if isinstance(m, (nn.Conv2d, nn.Conv3d)):
+ """
+ Follow the initialization method proposed in:
+ {He, Kaiming, et al.
+ "Delving deep into rectifiers: Surpassing human-level
+ performance on imagenet classification."
+ arXiv preprint arXiv:1502.01852 (2015)}
+ """
+ c2_msra_fill(m)
+ elif isinstance(m, nn.modules.batchnorm._NormBase):
+ if m.weight is not None:
+ if hasattr(m, "block_final_bn") and m.block_final_bn:
+ m.weight.data.fill_(0.0)
+ else:
+ m.weight.data.fill_(1.0)
+ if m.bias is not None:
+ m.bias.data.zero_()
+ if isinstance(m, nn.Linear):
+ m.weight.data.normal_(mean=0.0, std=fc_init_std)
+ if m.bias is not None:
+ m.bias.data.zero_()
+ return model
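+
+
+# A hedged usage sketch (illustrative only; `_demo_init_net_weights` and the toy
+# network are assumptions). Any nn.Module tree containing Conv, BatchNorm, and
+# Linear submodules is initialized the same way.
+def _demo_init_net_weights() -> nn.Module:
+    toy_net = nn.Sequential(
+        nn.Conv3d(3, 8, kernel_size=(1, 3, 3), padding=(0, 1, 1)),
+        nn.BatchNorm3d(8),
+        nn.ReLU(),
+        nn.AdaptiveAvgPool3d(1),
+        nn.Flatten(),
+        nn.Linear(8, 4),
+    )
+    return init_net_weights(toy_net, fc_init_std=0.01)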
diff --git a/pytorchvideo/models/x3d.py b/pytorchvideo/models/x3d.py
new file mode 100644
index 00000000..f74c5921
--- /dev/null
+++ b/pytorchvideo/models/x3d.py
@@ -0,0 +1,800 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Callable, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+from fvcore.nn.squeeze_excitation import SqueezeExcitation
+from pytorchvideo.layers.convolutions import Conv2plus1d
+from pytorchvideo.layers.swish import Swish
+from pytorchvideo.layers.utils import round_repeats, round_width, set_attributes
+from pytorchvideo.models.head import ResNetBasicHead
+from pytorchvideo.models.net import Net
+from pytorchvideo.models.resnet import BottleneckBlock, ResBlock, ResStage
+from pytorchvideo.models.stem import ResNetBasicStem
+
+
+def create_x3d_stem(
+ *,
+ # Conv configs.
+ in_channels: int,
+ out_channels: int,
+ conv_kernel_size: Tuple[int] = (5, 3, 3),
+ conv_stride: Tuple[int] = (1, 2, 2),
+ conv_padding: Tuple[int] = (2, 1, 1),
+ # BN configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+) -> nn.Module:
+ """
+    Creates the stem layer for X3D. It performs a spatial Conv, a temporal Conv, BN, and ReLU.
+
+ ::
+
+ Conv_xy
+ ↓
+ Conv_t
+ ↓
+ Normalization
+ ↓
+ Activation
+
+ Args:
+ in_channels (int): input channel size of the convolution.
+ out_channels (int): output channel size of the convolution.
+ conv_kernel_size (tuple): convolutional kernel size(s).
+ conv_stride (tuple): convolutional stride size(s).
+ conv_padding (tuple): convolutional padding size(s).
+
+ norm (callable): a callable that constructs normalization layer, options
+ include nn.BatchNorm3d, None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+
+ activation (callable): a callable that constructs activation layer, options
+ include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
+ activation).
+
+ Returns:
+ (nn.Module): X3D stem layer.
+ """
+ conv_xy_module = nn.Conv3d(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=(1, conv_kernel_size[1], conv_kernel_size[2]),
+ stride=(1, conv_stride[1], conv_stride[2]),
+ padding=(0, conv_padding[1], conv_padding[2]),
+ bias=False,
+ )
+ conv_t_module = nn.Conv3d(
+ in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=(conv_kernel_size[0], 1, 1),
+ stride=(conv_stride[0], 1, 1),
+ padding=(conv_padding[0], 0, 0),
+ bias=False,
+ groups=out_channels,
+ )
+    # Note: the spatial conv is passed as `conv_t` and the temporal conv as `conv_xy`,
+    # so that Conv2plus1d applies the spatial convolution first (Conv_xy then Conv_t,
+    # matching the diagram above).
+    stacked_conv_module = Conv2plus1d(
+        conv_t=conv_xy_module,
+        norm=None,
+        activation=None,
+        conv_xy=conv_t_module,
+    )
+
+ norm_module = (
+ None
+ if norm is None
+ else norm(num_features=out_channels, eps=norm_eps, momentum=norm_momentum)
+ )
+ activation_module = None if activation is None else activation()
+
+ return ResNetBasicStem(
+ conv=stacked_conv_module,
+ norm=norm_module,
+ activation=activation_module,
+ pool=None,
+ )
+
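+# A hedged shape sketch (illustrative only; `_demo_x3d_stem` and the clip size are
+# assumptions). The stem halves the spatial resolution and keeps the temporal length.
+def _demo_x3d_stem() -> torch.Tensor:
+    stem = create_x3d_stem(in_channels=3, out_channels=24)
+    clip = torch.rand(1, 3, 13, 160, 160)  # roughly X3D-S sized input
+    return stem(clip)  # expected shape (1, 24, 13, 80, 80)
+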
+
+def create_x3d_bottleneck_block(
+ *,
+ # Convolution configs.
+ dim_in: int,
+ dim_inner: int,
+ dim_out: int,
+ conv_kernel_size: Tuple[int] = (3, 3, 3),
+ conv_stride: Tuple[int] = (1, 2, 2),
+ # Norm configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ se_ratio: float = 0.0625,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+ inner_act: Callable = Swish,
+) -> nn.Module:
+ """
+ Bottleneck block for X3D: a sequence of Conv, Normalization with optional SE block,
+ and Activations repeated in the following order:
+
+ ::
+
+ Conv3d (conv_a)
+ ↓
+ Normalization (norm_a)
+ ↓
+ Activation (act_a)
+ ↓
+ Conv3d (conv_b)
+ ↓
+ Normalization (norm_b)
+ ↓
+ Squeeze-and-Excitation
+ ↓
+ Activation (act_b)
+ ↓
+ Conv3d (conv_c)
+ ↓
+ Normalization (norm_c)
+
+ Args:
+ dim_in (int): input channel size to the bottleneck block.
+ dim_inner (int): intermediate channel size of the bottleneck.
+ dim_out (int): output channel size of the bottleneck.
+ conv_kernel_size (tuple): convolutional kernel size(s) for conv_b.
+ conv_stride (tuple): convolutional stride size(s) for conv_b.
+
+ norm (callable): a callable that constructs normalization layer, examples
+ include nn.BatchNorm3d, None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+ se_ratio (float): if > 0, apply SE to the 3x3x3 conv, with the SE
+ channel dimensionality being se_ratio times the 3x3x3 conv dim.
+
+ activation (callable): a callable that constructs activation layer, examples
+ include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
+ activation).
+        inner_act (callable): the activation to use for act_b, e.g. Swish.
+
+ Returns:
+ (nn.Module): X3D bottleneck block.
+ """
+ # 1x1x1 Conv
+ conv_a = nn.Conv3d(
+ in_channels=dim_in, out_channels=dim_inner, kernel_size=(1, 1, 1), bias=False
+ )
+ norm_a = (
+ None
+ if norm is None
+ else norm(num_features=dim_inner, eps=norm_eps, momentum=norm_momentum)
+ )
+ act_a = None if activation is None else activation()
+
+ # 3x3x3 Conv
+ conv_b = nn.Conv3d(
+ in_channels=dim_inner,
+ out_channels=dim_inner,
+ kernel_size=conv_kernel_size,
+ stride=conv_stride,
+ padding=[size // 2 for size in conv_kernel_size],
+ bias=False,
+ groups=dim_inner,
+ dilation=(1, 1, 1),
+ )
+ se = (
+ SqueezeExcitation(
+ num_channels=dim_inner,
+ num_channels_reduced=round_width(dim_inner, se_ratio),
+ is_3d=True,
+ )
+ if se_ratio > 0.0
+ else nn.Identity()
+ )
+ norm_b = nn.Sequential(
+ (
+ nn.Identity()
+ if norm is None
+ else norm(num_features=dim_inner, eps=norm_eps, momentum=norm_momentum)
+ ),
+ se,
+ )
+ act_b = None if inner_act is None else inner_act()
+
+ # 1x1x1 Conv
+ conv_c = nn.Conv3d(
+ in_channels=dim_inner, out_channels=dim_out, kernel_size=(1, 1, 1), bias=False
+ )
+ norm_c = (
+ None
+ if norm is None
+ else norm(num_features=dim_out, eps=norm_eps, momentum=norm_momentum)
+ )
+
+ return BottleneckBlock(
+ conv_a=conv_a,
+ norm_a=norm_a,
+ act_a=act_a,
+ conv_b=conv_b,
+ norm_b=norm_b,
+ act_b=act_b,
+ conv_c=conv_c,
+ norm_c=norm_c,
+ )
+
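+# A hedged shape sketch (illustrative only; the channel sizes are assumptions in the
+# spirit of X3D's inverted bottleneck, where dim_inner is larger than dim_in).
+def _demo_x3d_bottleneck_block() -> torch.Tensor:
+    block = create_x3d_bottleneck_block(dim_in=24, dim_inner=54, dim_out=24)
+    features = torch.rand(1, 24, 13, 40, 40)  # assumed feature map size
+    return block(features)  # expected shape (1, 24, 13, 20, 20): default stride (1, 2, 2)
+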
+
+def create_x3d_res_block(
+ *,
+ # Bottleneck Block configs.
+ dim_in: int,
+ dim_inner: int,
+ dim_out: int,
+ bottleneck: Callable = create_x3d_bottleneck_block,
+ use_shortcut: bool = True,
+ # Conv configs.
+ conv_kernel_size: Tuple[int] = (3, 3, 3),
+ conv_stride: Tuple[int] = (1, 2, 2),
+ # Norm configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ se_ratio: float = 0.0625,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+ inner_act: Callable = Swish,
+) -> nn.Module:
+ """
+    Residual block for X3D. Performs a summation between an identity shortcut in branch1
+    and a main block in branch2. When the input and output dimensions differ, branch1
+    applies a convolution followed by a normalization before the summation.
+
+ ::
+
+ Input
+ |-------+
+ ↓ |
+ Block |
+ ↓ |
+ Summation ←-+
+ ↓
+ Activation
+
+ Args:
+ dim_in (int): input channel size to the bottleneck block.
+ dim_inner (int): intermediate channel size of the bottleneck.
+ dim_out (int): output channel size of the bottleneck.
+        bottleneck (callable): a callable that builds the bottleneck block, e.g.
+            create_x3d_bottleneck_block.
+
+ conv_kernel_size (tuple): convolutional kernel size(s) for conv_b.
+ conv_stride (tuple): convolutional stride size(s) for conv_b.
+
+ norm (callable): a callable that constructs normalization layer, examples
+ include nn.BatchNorm3d, None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+ se_ratio (float): if > 0, apply SE to the 3x3x3 conv, with the SE
+ channel dimensionality being se_ratio times the 3x3x3 conv dim.
+
+ activation (callable): a callable that constructs activation layer, examples
+ include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
+ activation).
+        inner_act (callable): the activation to use for act_b, e.g. Swish.
+
+ Returns:
+ (nn.Module): X3D block layer.
+ """
+
+ norm_model = None
+ if norm is not None and dim_in != dim_out:
+ norm_model = norm(num_features=dim_out)
+
+ return ResBlock(
+ branch1_conv=nn.Conv3d(
+ dim_in,
+ dim_out,
+ kernel_size=(1, 1, 1),
+ stride=conv_stride,
+ bias=False,
+ )
+ if (dim_in != dim_out or np.prod(conv_stride) > 1) and use_shortcut
+ else None,
+ branch1_norm=norm_model if dim_in != dim_out and use_shortcut else None,
+ branch2=bottleneck(
+ dim_in=dim_in,
+ dim_inner=dim_inner,
+ dim_out=dim_out,
+ conv_kernel_size=conv_kernel_size,
+ conv_stride=conv_stride,
+ norm=norm,
+ norm_eps=norm_eps,
+ norm_momentum=norm_momentum,
+ se_ratio=se_ratio,
+ activation=activation,
+ inner_act=inner_act,
+ ),
+ activation=None if activation is None else activation(),
+ branch_fusion=lambda x, y: x + y,
+ )
+
+
+def create_x3d_res_stage(
+ *,
+ # Stage configs.
+ depth: int,
+ # Bottleneck Block configs.
+ dim_in: int,
+ dim_inner: int,
+ dim_out: int,
+ bottleneck: Callable = create_x3d_bottleneck_block,
+ # Conv configs.
+ conv_kernel_size: Tuple[int] = (3, 3, 3),
+ conv_stride: Tuple[int] = (1, 2, 2),
+ # Norm configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ se_ratio: float = 0.0625,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+ inner_act: Callable = Swish,
+) -> nn.Module:
+ """
+ Create Residual Stage, which composes sequential blocks that make up X3D.
+
+ ::
+
+ Input
+ ↓
+ ResBlock
+ ↓
+ .
+ .
+ .
+ ↓
+ ResBlock
+
+ Args:
+
+        depth (int): number of blocks to create.
+
+ dim_in (int): input channel size to the bottleneck block.
+ dim_inner (int): intermediate channel size of the bottleneck.
+ dim_out (int): output channel size of the bottleneck.
+        bottleneck (callable): a callable that builds the bottleneck block, e.g.
+            create_x3d_bottleneck_block.
+
+ conv_kernel_size (tuple): convolutional kernel size(s) for conv_b.
+ conv_stride (tuple): convolutional stride size(s) for conv_b.
+
+ norm (callable): a callable that constructs normalization layer, examples
+ include nn.BatchNorm3d, None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+ se_ratio (float): if > 0, apply SE to the 3x3x3 conv, with the SE
+ channel dimensionality being se_ratio times the 3x3x3 conv dim.
+
+ activation (callable): a callable that constructs activation layer, examples
+ include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
+ activation).
+        inner_act (callable): the activation to use for act_b, e.g. Swish.
+
+ Returns:
+ (nn.Module): X3D stage layer.
+ """
+ res_blocks = []
+ for idx in range(depth):
+ block = create_x3d_res_block(
+ dim_in=dim_in if idx == 0 else dim_out,
+ dim_inner=dim_inner,
+ dim_out=dim_out,
+ bottleneck=bottleneck,
+ conv_kernel_size=conv_kernel_size,
+ conv_stride=conv_stride if idx == 0 else (1, 1, 1),
+ norm=norm,
+ norm_eps=norm_eps,
+ norm_momentum=norm_momentum,
+ se_ratio=(se_ratio if (idx + 1) % 2 else 0.0),
+ activation=activation,
+ inner_act=inner_act,
+ )
+ res_blocks.append(block)
+
+ return ResStage(res_blocks=nn.ModuleList(res_blocks))
+
+
+def create_x3d_head(
+ *,
+ # Projection configs.
+ dim_in: int,
+ dim_inner: int,
+ dim_out: int,
+ num_classes: int,
+ # Pooling configs.
+ pool_act: Callable = nn.ReLU,
+ pool_kernel_size: Tuple[int] = (13, 5, 5),
+ # BN configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ bn_lin5_on=False,
+ # Dropout configs.
+ dropout_rate: float = 0.5,
+ # Activation configs.
+ activation: Callable = nn.Softmax,
+ # Output configs.
+ output_with_global_average: bool = True,
+) -> nn.Module:
+ """
+    Creates the X3D head. This layer performs a projected pooling operation followed
+    by dropout, a fully-connected projection, an activation layer, and a global
+    spatiotemporal averaging.
+
+ ::
+
+ ProjectedPool
+ ↓
+ Dropout
+ ↓
+ Projection
+ ↓
+ Activation
+ ↓
+ Averaging
+
+ Args:
+ dim_in (int): input channel size of the X3D head.
+ dim_inner (int): intermediate channel size of the X3D head.
+ dim_out (int): output channel size of the X3D head.
+ num_classes (int): the number of classes for the video dataset.
+
+ pool_act (callable): a callable that constructs resnet pool activation
+ layer such as nn.ReLU.
+ pool_kernel_size (tuple): pooling kernel size(s) when not using adaptive
+ pooling.
+
+ norm (callable): a callable that constructs normalization layer, examples
+ include nn.BatchNorm3d, None (not performing normalization).
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+ bn_lin5_on (bool): if True, perform normalization on the features
+ before the classifier.
+
+ dropout_rate (float): dropout rate.
+
+ activation (callable): a callable that constructs resnet head activation
+ layer, examples include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not
+ applying activation).
+
+ output_with_global_average (bool): if True, perform global averaging on temporal
+ and spatial dimensions and reshape output to batch_size x out_features.
+
+ Returns:
+ (nn.Module): X3D head layer.
+ """
+ pre_conv_module = nn.Conv3d(
+ in_channels=dim_in, out_channels=dim_inner, kernel_size=(1, 1, 1), bias=False
+ )
+
+ pre_norm_module = norm(num_features=dim_inner, eps=norm_eps, momentum=norm_momentum)
+ pre_act_module = None if pool_act is None else pool_act()
+
+ if pool_kernel_size is None:
+ pool_module = nn.AdaptiveAvgPool3d((1, 1, 1))
+ else:
+ pool_module = nn.AvgPool3d(pool_kernel_size, stride=1)
+
+ post_conv_module = nn.Conv3d(
+ in_channels=dim_inner, out_channels=dim_out, kernel_size=(1, 1, 1), bias=False
+ )
+
+ if bn_lin5_on:
+ post_norm_module = norm(
+ num_features=dim_out, eps=norm_eps, momentum=norm_momentum
+ )
+ else:
+ post_norm_module = None
+ post_act_module = None if pool_act is None else pool_act()
+
+ projected_pool_module = ProjectedPool(
+ pre_conv=pre_conv_module,
+ pre_norm=pre_norm_module,
+ pre_act=pre_act_module,
+ pool=pool_module,
+ post_conv=post_conv_module,
+ post_norm=post_norm_module,
+ post_act=post_act_module,
+ )
+
+ if activation is None:
+ activation_module = None
+ elif activation == nn.Softmax:
+ activation_module = activation(dim=1)
+ elif activation == nn.Sigmoid:
+ activation_module = activation()
+ else:
+        raise NotImplementedError(
+            "{} is not supported as an activation function.".format(activation)
+        )
+
+ if output_with_global_average:
+ output_pool = nn.AdaptiveAvgPool3d(1)
+ else:
+ output_pool = None
+
+ return ResNetBasicHead(
+ proj=nn.Linear(dim_out, num_classes, bias=True),
+ activation=activation_module,
+ pool=projected_pool_module,
+ dropout=nn.Dropout(dropout_rate) if dropout_rate > 0 else None,
+ output_pool=output_pool,
+ )
+
+
+def create_x3d(
+ *,
+ # Input clip configs.
+ input_channel: int = 3,
+ input_clip_length: int = 13,
+ input_crop_size: int = 160,
+ # Model configs.
+ model_num_class: int = 400,
+ dropout_rate: float = 0.5,
+ width_factor: float = 2.0,
+ depth_factor: float = 2.2,
+ # Normalization configs.
+ norm: Callable = nn.BatchNorm3d,
+ norm_eps: float = 1e-5,
+ norm_momentum: float = 0.1,
+ # Activation configs.
+ activation: Callable = nn.ReLU,
+ # Stem configs.
+ stem_dim_in: int = 12,
+ stem_conv_kernel_size: Tuple[int] = (5, 3, 3),
+ stem_conv_stride: Tuple[int] = (1, 2, 2),
+ # Stage configs.
+ stage_conv_kernel_size: Tuple[Tuple[int]] = (
+ (3, 3, 3),
+ (3, 3, 3),
+ (3, 3, 3),
+ (3, 3, 3),
+ ),
+ stage_spatial_stride: Tuple[int] = (2, 2, 2, 2),
+ stage_temporal_stride: Tuple[int] = (1, 1, 1, 1),
+ bottleneck: Callable = create_x3d_bottleneck_block,
+ bottleneck_factor: float = 2.25,
+ se_ratio: float = 0.0625,
+ inner_act: Callable = Swish,
+ # Head configs.
+ head_dim_out: int = 2048,
+ head_pool_act: Callable = nn.ReLU,
+ head_bn_lin5_on: bool = False,
+ head_activation: Callable = nn.Softmax,
+ head_output_with_global_average: bool = True,
+) -> nn.Module:
+ """
+    X3D model builder. It builds an X3D network with a ResNet-style backbone.
+
+ Christoph Feichtenhofer.
+ "X3D: Expanding Architectures for Efficient Video Recognition."
+ https://arxiv.org/abs/2004.04730
+
+ ::
+
+ Input
+ ↓
+ Stem
+ ↓
+ Stage 1
+ ↓
+ .
+ .
+ .
+ ↓
+ Stage N
+ ↓
+ Head
+
+ Args:
+ input_channel (int): number of channels for the input video clip.
+ input_clip_length (int): length of the input video clip. Value for
+ different models: X3D-XS: 4; X3D-S: 13; X3D-M: 16; X3D-L: 16.
+ input_crop_size (int): spatial resolution of the input video clip.
+ Value for different models: X3D-XS: 160; X3D-S: 160; X3D-M: 224;
+ X3D-L: 312.
+
+ model_num_class (int): the number of classes for the video dataset.
+ dropout_rate (float): dropout rate.
+ width_factor (float): width expansion factor.
+ depth_factor (float): depth expansion factor. Value for different
+ models: X3D-XS: 2.2; X3D-S: 2.2; X3D-M: 2.2; X3D-L: 5.0.
+
+ norm (callable): a callable that constructs normalization layer.
+ norm_eps (float): normalization epsilon.
+ norm_momentum (float): normalization momentum.
+
+ activation (callable): a callable that constructs activation layer.
+
+ stem_dim_in (int): input channel size for stem before expansion.
+ stem_conv_kernel_size (tuple): convolutional kernel size(s) of stem.
+ stem_conv_stride (tuple): convolutional stride size(s) of stem.
+
+        stage_conv_kernel_size (tuple): convolutional kernel size(s) for conv_b.
+        stage_spatial_stride (tuple): the spatial stride for each stage.
+        stage_temporal_stride (tuple): the temporal stride for each stage.
+        bottleneck (callable): a callable that builds the bottleneck block, e.g.
+            create_x3d_bottleneck_block.
+        bottleneck_factor (float): bottleneck expansion factor for the 3x3x3 conv.
+        se_ratio (float): if > 0, apply SE to the 3x3x3 conv, with the SE
+            channel dimensionality being se_ratio times the 3x3x3 conv dim.
+        inner_act (callable): the activation to use for act_b, e.g. Swish.
+
+ head_dim_out (int): output channel size of the X3D head.
+ head_pool_act (callable): a callable that constructs resnet pool activation
+ layer such as nn.ReLU.
+ head_bn_lin5_on (bool): if True, perform normalization on the features
+ before the classifier.
+ head_activation (callable): a callable that constructs activation layer.
+ head_output_with_global_average (bool): if True, perform global averaging on
+ the head output.
+
+ Returns:
+ (nn.Module): the X3D network.
+ """
+ blocks = []
+ # Create stem for X3D.
+ stem_dim_out = round_width(stem_dim_in, width_factor)
+ stem = create_x3d_stem(
+ in_channels=input_channel,
+ out_channels=stem_dim_out,
+ conv_kernel_size=stem_conv_kernel_size,
+ conv_stride=stem_conv_stride,
+ conv_padding=[size // 2 for size in stem_conv_kernel_size],
+ norm=norm,
+ norm_eps=norm_eps,
+ norm_momentum=norm_momentum,
+ activation=activation,
+ )
+ blocks.append(stem)
+
+ # Compute the depth and dimension for each stage
+ stage_depths = [1, 2, 5, 3]
+ exp_stage = 2.0
+ stage_dim1 = stem_dim_in
+ stage_dim2 = round_width(stage_dim1, exp_stage, divisor=8)
+ stage_dim3 = round_width(stage_dim2, exp_stage, divisor=8)
+ stage_dim4 = round_width(stage_dim3, exp_stage, divisor=8)
+ stage_dims = [stage_dim1, stage_dim2, stage_dim3, stage_dim4]
+
+ dim_in = stem_dim_out
+ # Create each stage for X3D.
+ for idx in range(len(stage_depths)):
+ dim_out = round_width(stage_dims[idx], width_factor)
+ dim_inner = int(bottleneck_factor * dim_out)
+ depth = round_repeats(stage_depths[idx], depth_factor)
+
+ stage_conv_stride = (
+ stage_temporal_stride[idx],
+ stage_spatial_stride[idx],
+ stage_spatial_stride[idx],
+ )
+
+ stage = create_x3d_res_stage(
+ depth=depth,
+ dim_in=dim_in,
+ dim_inner=dim_inner,
+ dim_out=dim_out,
+ bottleneck=bottleneck,
+ conv_kernel_size=stage_conv_kernel_size[idx],
+ conv_stride=stage_conv_stride,
+ norm=norm,
+ norm_eps=norm_eps,
+ norm_momentum=norm_momentum,
+ se_ratio=se_ratio,
+ activation=activation,
+ inner_act=inner_act,
+ )
+ blocks.append(stage)
+ dim_in = dim_out
+
+ # Create head for X3D.
+ total_spatial_stride = stem_conv_stride[1] * np.prod(stage_spatial_stride)
+ total_temporal_stride = stem_conv_stride[0] * np.prod(stage_temporal_stride)
+
+ assert (
+ input_clip_length >= total_temporal_stride
+ ), "Clip length doesn't match temporal stride!"
+ assert (
+ input_crop_size >= total_spatial_stride
+ ), "Crop size doesn't match spatial stride!"
+
+ head_pool_kernel_size = (
+ input_clip_length // total_temporal_stride,
+ input_crop_size // total_spatial_stride,
+ input_crop_size // total_spatial_stride,
+ )
+
+ head = create_x3d_head(
+ dim_in=dim_out,
+ dim_inner=dim_inner,
+ dim_out=head_dim_out,
+ num_classes=model_num_class,
+ pool_act=head_pool_act,
+ pool_kernel_size=head_pool_kernel_size,
+ norm=norm,
+ norm_eps=norm_eps,
+ norm_momentum=norm_momentum,
+ bn_lin5_on=head_bn_lin5_on,
+ dropout_rate=dropout_rate,
+ activation=head_activation,
+ output_with_global_average=head_output_with_global_average,
+ )
+ blocks.append(head)
+ return Net(blocks=nn.ModuleList(blocks))
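+
+# A minimal usage sketch (hedged): it assumes the remaining keyword arguments of
+# create_x3d keep the defaults declared in the signature above and only pins the
+# input geometry and class count documented in the docstring.
+#
+#   model = create_x3d(
+#       input_clip_length=13, input_crop_size=160, model_num_class=400
+#   )
+#   clip = torch.randn(1, 3, 13, 160, 160)  # (batch, channel, time, height, width)
+#   logits = model(clip)  # expected shape: (1, 400) with the default head settings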
+
+
+class ProjectedPool(nn.Module):
+ """
+ A pooling module augmented with Conv, Normalization and Activation both
+ before and after pooling for the head layer of X3D.
+
+ ::
+
+ Conv3d (pre_conv)
+ ↓
+ Normalization (pre_norm)
+ ↓
+ Activation (pre_act)
+ ↓
+ Pool3d
+ ↓
+ Conv3d (post_conv)
+ ↓
+ Normalization (post_norm)
+ ↓
+ Activation (post_act)
+ """
+
+ def __init__(
+ self,
+ *,
+ pre_conv: nn.Module = None,
+ pre_norm: nn.Module = None,
+ pre_act: nn.Module = None,
+ pool: nn.Module = None,
+ post_conv: nn.Module = None,
+ post_norm: nn.Module = None,
+ post_act: nn.Module = None,
+ ) -> None:
+ """
+ Args:
+ pre_conv (torch.nn.modules): convolutional module.
+ pre_norm (torch.nn.modules): normalization module.
+ pre_act (torch.nn.modules): activation module.
+ pool (torch.nn.modules): pooling module.
+ post_conv (torch.nn.modules): convolutional module.
+ post_norm (torch.nn.modules): normalization module.
+ post_act (torch.nn.modules): activation module.
+ """
+ super().__init__()
+ set_attributes(self, locals())
+ assert self.pre_conv is not None
+ assert self.pool is not None
+ assert self.post_conv is not None
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.pre_conv(x)
+
+ if self.pre_norm is not None:
+ x = self.pre_norm(x)
+ if self.pre_act is not None:
+ x = self.pre_act(x)
+
+ x = self.pool(x)
+ x = self.post_conv(x)
+
+ if self.post_norm is not None:
+ x = self.post_norm(x)
+ if self.post_act is not None:
+ x = self.post_act(x)
+ return x
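+
+
+# A hedged construction sketch for ProjectedPool; the channel sizes and pooling
+# kernel below are illustrative only, not the exact values used by create_x3d_head.
+#
+#   pool = ProjectedPool(
+#       pre_conv=nn.Conv3d(192, 432, kernel_size=(1, 1, 1), bias=False),
+#       pre_norm=nn.BatchNorm3d(432),
+#       pre_act=nn.ReLU(),
+#       pool=nn.AvgPool3d((4, 7, 7)),
+#       post_conv=nn.Conv3d(432, 2048, kernel_size=(1, 1, 1), bias=False),
+#       post_act=nn.ReLU(),
+#   )
+#   y = pool(torch.randn(1, 192, 4, 7, 7))  # y.shape == (1, 2048, 1, 1, 1)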
diff --git a/pytorchvideo/transforms/__init__.py b/pytorchvideo/transforms/__init__.py
new file mode 100644
index 00000000..d663f798
--- /dev/null
+++ b/pytorchvideo/transforms/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from .transforms import * # noqa
diff --git a/pytorchvideo/transforms/functional.py b/pytorchvideo/transforms/functional.py
new file mode 100644
index 00000000..514354ad
--- /dev/null
+++ b/pytorchvideo/transforms/functional.py
@@ -0,0 +1,212 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import math
+from typing import Tuple
+
+import numpy as np
+import torch
+
+
+try:
+ import cv2
+except ImportError:
+ _HAS_CV2 = False
+else:
+ _HAS_CV2 = True
+
+
+def uniform_temporal_subsample(
+ x: torch.Tensor, num_samples: int, temporal_dim: int = 1
+) -> torch.Tensor:
+ """
+ Uniformly subsamples num_samples indices from the temporal dimension of the video.
+ When num_samples is larger than the size of the temporal dimension of the
+ video, frames are repeated (nearest-neighbor sampling).
+
+ Args:
+ x (torch.Tensor): a video tensor with more than one dimension; any torch
+ dtype (int, long, float, complex, etc.) is supported.
+ num_samples (int): the number of equispaced samples to select.
+ temporal_dim (int): the temporal dimension to subsample along.
+
+ Returns:
+ An x-like Tensor with subsampled temporal dimension.
+ """
+ t = x.shape[temporal_dim]
+ assert num_samples > 0 and t > 0
+ # Sample by nearest neighbor interpolation if num_samples > t.
+ indices = torch.linspace(0, t - 1, num_samples)
+ indices = torch.clamp(indices, 0, t - 1).long()
+ return torch.index_select(x, temporal_dim, indices)
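+
+
+# Example (hedged sketch): subsample 8 frames from a 32-frame clip laid out as
+# (C, T, H, W); the default temporal_dim=1 matches that layout.
+#
+#   video = torch.randn(3, 32, 224, 224)
+#   short_clip = uniform_temporal_subsample(video, num_samples=8)
+#   # short_clip.shape == (3, 8, 224, 224)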
+
+
+@torch.jit.ignore
+def _interpolate_opencv(
+ x: torch.Tensor, size: Tuple[int, int], interpolation: str
+) -> torch.Tensor:
+ """
+ Down/up samples the input torch tensor x to the given size with given interpolation
+ mode.
+ Args:
+ x (torch.Tensor): the input tensor to be down/up sampled.
+ size (Tuple[int, int]): expected output spatial size.
+ interpolation (str): interpolation mode; options include `nearest`,
+ `linear`, `bilinear`, and `bicubic`.
+ """
+ if not _HAS_CV2:
+ raise ImportError(
+ "opencv is required to use opencv transforms. Please "
+ "install with 'pip install opencv-python'."
+ )
+
+ _opencv_pytorch_interpolation_map = {
+ "nearest": cv2.INTER_NEAREST,
+ "linear": cv2.INTER_LINEAR,
+ "bilinear": cv2.INTER_AREA,
+ "bicubic": cv2.INTER_CUBIC,
+ }
+ assert interpolation in _opencv_pytorch_interpolation_map
+ new_h, new_w = size
+ img_array_list = [
+ img_tensor.squeeze(0).numpy()
+ for img_tensor in x.permute(1, 2, 3, 0).split(1, dim=0)
+ ]
+ resized_img_array_list = [
+ cv2.resize(
+ img_array,
+ (new_w, new_h), # The input order for OpenCV is w, h.
+ interpolation=_opencv_pytorch_interpolation_map[interpolation],
+ )
+ for img_array in img_array_list
+ ]
+ img_array = np.concatenate(
+ [np.expand_dims(img_array, axis=0) for img_array in resized_img_array_list],
+ axis=0,
+ )
+ img_tensor = torch.from_numpy(np.ascontiguousarray(img_array))
+ img_tensor = img_tensor.permute(3, 0, 1, 2)
+ return img_tensor
+
+
+def short_side_scale(
+ x: torch.Tensor,
+ size: int,
+ interpolation: str = "bilinear",
+ backend: str = "pytorch",
+) -> torch.Tensor:
+ """
+ Determines the shorter spatial dim of the video (i.e. width or height) and scales
+ it to the given size. To maintain aspect ratio, the longer side is then scaled
+ accordingly.
+
+ Args:
+ x (torch.Tensor): A video tensor of shape (C, T, H, W) and type torch.float32.
+ size (int): The size the shorter side is scaled to.
+ interpolation (str): algorithm used for scaling,
+ options: 'nearest' | 'linear' | 'bilinear' | 'bicubic' | 'trilinear' | 'area'
+ backend (str): backend used to perform interpolation. Options include
+ `pytorch` (default) and `opencv`. Note that OpenCV and PyTorch behave
+ differently for linear interpolation in some versions.
+ https://discuss.pytorch.org/t/pytorch-linear-interpolation-is-different-from-pil-opencv/71181
+
+ Returns:
+ An x-like Tensor with scaled spatial dims.
+ """ # noqa
+ assert len(x.shape) == 4
+ assert x.dtype == torch.float32
+ assert backend in ("pytorch", "opencv")
+ c, t, h, w = x.shape
+ if w < h:
+ new_h = int(math.floor((float(h) / w) * size))
+ new_w = size
+ else:
+ new_h = size
+ new_w = int(math.floor((float(w) / h) * size))
+ if backend == "pytorch":
+ return torch.nn.functional.interpolate(
+ x, size=(new_h, new_w), mode=interpolation, align_corners=False
+ )
+ elif backend == "opencv":
+ return _interpolate_opencv(x, size=(new_h, new_w), interpolation=interpolation)
+ else:
+ raise NotImplementedError(f"{backend} backend not supported.")
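+
+
+# Example (hedged sketch): scale a (C, T, H, W) float clip so its shorter spatial
+# side becomes 256 while the aspect ratio is preserved.
+#
+#   clip = torch.randn(3, 8, 240, 320, dtype=torch.float32)
+#   scaled = short_side_scale(clip, size=256)
+#   # scaled.shape == (3, 8, 256, 341)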
+
+
+def repeat_temporal_frames_subsample(
+ frames: torch.Tensor, frame_ratios: Tuple[int], temporal_dim: int = 1
+) -> Tuple[torch.Tensor]:
+ """
+ Prepare output as a list of tensors subsampled from the input frames. Each
+ tensor maintains a unique copy of the subsampled frames, corresponding to a
+ unique pathway.
+ Args:
+ frames (tensor): frames of images sampled from the video. Expected to be a
+ torch tensor (of any dtype: int, long, float, complex, etc.) with more
+ than one dimension.
+ frame_ratios (tuple): temporal down-sampling ratio for each pathway.
+ temporal_dim (int): the temporal dimension to subsample along.
+ Returns:
+ frame_list (tuple): one subsampled tensor per pathway.
+ """
+ temporal_length = frames.shape[temporal_dim]
+ frame_list = []
+ for ratio in frame_ratios:
+ pathway = uniform_temporal_subsample(
+ frames, temporal_length // ratio, temporal_dim
+ )
+ frame_list.append(pathway)
+ return frame_list
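+
+
+# Example (hedged sketch): build SlowFast-style pathways from a 32-frame clip,
+# keeping every frame for one pathway and every 4th frame for the other.
+#
+#   frames = torch.randn(3, 32, 224, 224)
+#   fast, slow = repeat_temporal_frames_subsample(frames, frame_ratios=(1, 4))
+#   # fast.shape == (3, 32, 224, 224), slow.shape == (3, 8, 224, 224)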
+
+
+def convert_to_one_hot(targets: torch.Tensor, num_class: int) -> torch.Tensor:
+ """
+ This function converts target class indices to one-hot vectors, given the number of classes.
+ """
+
+ assert (
+ torch.max(targets).item() < num_class
+ ), "Class Index must be less than number of classes"
+
+ one_hot_targets = torch.zeros(
+ (targets.shape[0], num_class), dtype=torch.long, device=targets.device
+ )
+ one_hot_targets.scatter_(1, targets.long(), 1)
+
+ return one_hot_targets
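+
+
+# Example (hedged sketch): targets are expected as a column of class indices with
+# shape (batch_size, 1), so that scatter_ along dim=1 marks one entry per row.
+#
+#   targets = torch.tensor([[0], [2], [1]])
+#   one_hot = convert_to_one_hot(targets, num_class=3)
+#   # tensor([[1, 0, 0],
+#   #         [0, 0, 1],
+#   #         [0, 1, 0]])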
+
+
+def uniform_crop(frames: torch.Tensor, size: int, spatial_idx: int = 1) -> torch.Tensor:
+ """
+ Perform uniform spatial sampling on the frames based on three-crop setting.
+ If width is larger than height, take left, center and right crop.
+ If height is larger than width, take top, center, and bottom crop.
+ Args:
+ frames (tensor): A video tensor of shape (C, T, H, W) to perform uniform crop.
+ size (int): Desired height and width of the crop.
+ spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width
+ is larger than height. Or 0, 1, or 2 for top, center, and bottom
+ crop if height is larger than width.
+ Returns:
+ cropped (tensor): A cropped video tensor of shape (C, T, size, size).
+ """
+
+ assert spatial_idx in [0, 1, 2]
+ height = frames.shape[2]
+ width = frames.shape[3]
+
+ y_offset = int(math.ceil((height - size) / 2))
+ x_offset = int(math.ceil((width - size) / 2))
+
+ if height > width:
+ if spatial_idx == 0:
+ y_offset = 0
+ elif spatial_idx == 2:
+ y_offset = height - size
+ else:
+ if spatial_idx == 0:
+ x_offset = 0
+ elif spatial_idx == 2:
+ x_offset = width - size
+ cropped = frames[:, :, y_offset : y_offset + size, x_offset : x_offset + size]
+
+ return cropped
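+
+
+# Example (hedged sketch): take the three standard crops of a wide clip. For a
+# (C, T, H, W) tensor with W > H, spatial_idx 0/1/2 selects the left/center/right crop.
+#
+#   frames = torch.randn(3, 8, 256, 320)
+#   crops = [uniform_crop(frames, size=256, spatial_idx=i) for i in range(3)]
+#   # each crop has shape (3, 8, 256, 256)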
diff --git a/pytorchvideo/transforms/transforms.py b/pytorchvideo/transforms/transforms.py
new file mode 100644
index 00000000..fd214097
--- /dev/null
+++ b/pytorchvideo/transforms/transforms.py
@@ -0,0 +1,104 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from typing import Callable, Dict
+
+import pytorchvideo.transforms.functional
+import torch
+
+
+class ApplyTransformToKey:
+ """
+ Applies transform to key of dictionary input.
+
+ Args:
+ key (str): the dictionary key the transform is applied to
+ transform (callable): the transform that is applied
+
+ Example:
+ >>> transforms.ApplyTransformToKey(
+ >>> key='video',
+ >>> transform=UniformTemporalSubsample(num_video_samples),
+ >>> )
+ """
+
+ def __init__(self, key: str, transform: Callable):
+ self._key = key
+ self._transform = transform
+
+ def __call__(self, x: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+ x[self._key] = self._transform(x[self._key])
+ return x
+
+
+class RemoveKey(torch.nn.Module):
+ def __init__(self, key: str):
+ super().__init__()
+ self._key = key
+
+ def __call__(self, x: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+ if self._key in x:
+ del x[self._key]
+ return x
+
+
+class UniformTemporalSubsample(torch.nn.Module):
+ """
+ nn.Module wrapper for pytorchvideo.transforms.functional.uniform_temporal_subsample.
+ """
+
+ def __init__(self, num_samples: int):
+ super().__init__()
+ self._num_samples = num_samples
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return pytorchvideo.transforms.functional.uniform_temporal_subsample(
+ x, self._num_samples
+ )
+
+
+class ShortSideScale(torch.nn.Module):
+ """
+ nn.Module wrapper for pytorchvideo.transforms.functional.short_side_scale.
+ """
+
+ def __init__(self, size: int):
+ super().__init__()
+ self._size = size
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return pytorchvideo.transforms.functional.short_side_scale(x, self._size)
+
+
+class RandomShortSideScale(torch.nn.Module):
+ """
+ nn.Module wrapper for pytorchvideo.transforms.functional.short_side_scale. The size
+ parameter is chosen randomly in [min_size, max_size].
+ """
+
+ def __init__(self, min_size: int, max_size: int):
+ super().__init__()
+ self._min_size = min_size
+ self._max_size = max_size
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ size = torch.randint(self._min_size, self._max_size + 1, (1,)).item()
+ return pytorchvideo.transforms.functional.short_side_scale(x, size)
+
+
+class UniformCropVideo(torch.nn.Module):
+ """
+ nn.Module wrapper for pytorchvideo.transforms.functional.uniform_crop.
+ """
+
+ def __init__(
+ self, size: int, video_key: str = "video", aug_index_key: str = "aug_index"
+ ):
+ super().__init__()
+ self._size = size
+ self._video_key = video_key
+ self._aug_index_key = aug_index_key
+
+ def __call__(self, x: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+ x[self._video_key] = pytorchvideo.transforms.functional.uniform_crop(
+ x[self._video_key], self._size, x[self._aug_index_key]
+ )
+ return x
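+
+
+# A hedged end-to-end sketch chaining the transforms above on a dataset sample
+# dict; the tensor sizes are illustrative and no external composition helper is
+# assumed.
+#
+#   sample = {"video": torch.randn(3, 32, 240, 320), "aug_index": 1}
+#   sample = ApplyTransformToKey(
+#       key="video", transform=UniformTemporalSubsample(8)
+#   )(sample)
+#   sample = ApplyTransformToKey(key="video", transform=ShortSideScale(256))(sample)
+#   sample = UniformCropVideo(size=256)(sample)
+#   # sample["video"].shape == (3, 8, 256, 256)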
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 00000000..0f73488f
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,8 @@
+[isort]
+line_length = 88
+multi_line_output = 3
+include_trailing_comma = True
+force_grid_wrap = 0
+default_section = THIRDPARTY
+lines_after_imports = 2
+combine_as_imports = True
diff --git a/setup.py b/setup.py
new file mode 100755
index 00000000..e7491202
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import os
+
+from setuptools import find_packages, setup
+
+
+def get_version():
+ init_py_path = os.path.join(
+ os.path.abspath(os.path.dirname(__file__)), "pytorchvideo", "__init__.py"
+ )
+ with open(init_py_path, "r") as f:
+ init_py = f.readlines()
+ version_line = [
+ lines.strip() for lines in init_py if lines.startswith("__version__")
+ ][0]
+ version = version_line.split("=")[-1].strip().strip("'\"")
+
+ # Used by CI to build nightly packages. Users should never use it.
+ # To build a nightly wheel, run:
+ # BUILD_NIGHTLY=1 python setup.py sdist
+ if os.getenv("BUILD_NIGHTLY", "0") == "1":
+ from datetime import datetime
+
+ date_str = datetime.today().strftime("%Y%m%d")
+ # pip can perform proper comparison for ".post" suffix,
+ # i.e., "1.1.post1234" >= "1.1"
+ version = version + ".post" + date_str
+
+ new_init_py = [l for l in init_py if not l.startswith("__version__")]
+ new_init_py.append('__version__ = "{}"\n'.format(version))
+ with open(init_py_path, "w") as f:
+ f.write("".join(new_init_py))
+
+ return version
+
+
+def get_name():
+ name = "pytorchvideo"
+ if os.getenv("BUILD_NIGHTLY", "0") == "1":
+ name += "-nightly"
+ return name
+
+
+setup(
+ name=get_name(),
+ version=get_version(),
+ license="Apache 2.0",
+ author="Facebook AI",
+ url="https://github.com/facebookresearch/pytorchvideo",
+ description="A video understanding deep learning library.",
+ python_requires=">=3.7",
+ install_requires=[
+ "fvcore>=0.1.4",
+ "av",
+ "parameterized",
+ "iopath",
+ ],
+ extras_require={
+ "test": ["coverage", "pytest", "opencv-python"],
+ "dev": [
+ "opencv-python",
+ "black==20.8b1",
+ "sphinx",
+ "isort==4.3.21",
+ "flake8==3.8.1",
+ "flake8-bugbear",
+ "flake8-comprehensions",
+ "pre-commit",
+ "nbconvert",
+ "bs4",
+ "autoflake==1.4",
+ ],
+ "opencv-python": [
+ "opencv-python",
+ ],
+ },
+ packages=find_packages(exclude=("scripts", "tests")),
+)
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 00000000..801caa05
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,21 @@
+## Unit Tests
+
+
+Before running the tests, please ensure that you have installed the necessary additional test dependencies.
+If not installed, check the [install-README](https://github.com/facebookresearch/pytorchvideo/blob/master/INSTALL.md) on how to do it.
+
+Use the following command to run the tests:
+```
+# From root of the project
+python -m unittest discover -v -s ./tests
+```
+
+To generate the coverage reports, please run the following command:
+```
+# Install coverage using
+pip install coverage
+
+# From root of the project
+coverage run -m unittest discover -v -s tests
+```
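+
+After the run finishes, the collected data can be turned into a readable report. The commands below are a minimal sketch; `coverage html` writes its output to an `htmlcov/` directory by default.
+```
+# Print a per-file summary in the terminal
+coverage report -m
+
+# Or render a browsable HTML report
+coverage html
+```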
+
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..5c7f19c6
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/tests/benchmark_accelerator_efficient_blocks.py b/tests/benchmark_accelerator_efficient_blocks.py
new file mode 100644
index 00000000..0696c702
--- /dev/null
+++ b/tests/benchmark_accelerator_efficient_blocks.py
@@ -0,0 +1,355 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import logging
+import unittest
+from typing import Callable
+
+import torch
+import torch.nn as nn
+
+# import torch.quantization.quantize_fx as quantize_fx
+from fvcore.common.benchmark import benchmark
+from pytorchvideo.layers.accelerator.mobile_cpu.convolutions import (
+ Conv3d3x3x3DwBnAct,
+ Conv3dPwBnAct,
+)
+from pytorchvideo.models.accelerator.mobile_cpu.residual_blocks import (
+ X3dBottleneckBlock,
+)
+from torch.utils.mobile_optimizer import optimize_for_mobile
+
+
+class TestBenchmarkEfficientBlocks(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_benchmark_conv3d_pw_bn_relu(self, num_iters: int = 20) -> None:
+ """
+ Benchmark Conv3dPwBnAct with ReLU activation.
+ Note that the efficient block Conv3dPwBnAct is designed for mobile CPUs with the
+ qnnpack backend; benchmarking on a server with another backend (e.g., fbgemm)
+ may yield different latency than running on a mobile CPU with qnnpack.
+ Running qnnpack on an x86-based server CPU may also yield different latency than
+ running it on a mobile CPU, since qnnpack is optimized for ARM-based mobile CPUs.
+ Args:
+ num_iters (int): number of iterations to perform benchmarking.
+ """
+
+ torch.backends.quantized.engine = "qnnpack"
+ kwargs_list = [
+ {
+ "mode": "original",
+ "input_blob_size": (1, 48, 4, 40, 40),
+ "in_channels": 48,
+ "out_channels": 108,
+ "quantize": False,
+ },
+ {
+ "mode": "deployable",
+ "input_blob_size": (1, 48, 4, 40, 40),
+ "in_channels": 48,
+ "out_channels": 108,
+ "quantize": False,
+ },
+ {
+ "mode": "original",
+ "input_blob_size": (1, 48, 4, 40, 40),
+ "in_channels": 48,
+ "out_channels": 108,
+ "quantize": True,
+ },
+ {
+ "mode": "deployable",
+ "input_blob_size": (1, 48, 4, 40, 40),
+ "in_channels": 48,
+ "out_channels": 108,
+ "quantize": True,
+ },
+ ]
+
+ def _benchmark_conv3d_pw_bn_relu_forward(**kwargs) -> Callable:
+ assert kwargs["mode"] in ("original", "deployable"), (
+ "kwargs['mode'] must be either 'original' or 'deployable',"
+ "but got {}.".format(kwargs["mode"])
+ )
+ input_tensor = torch.randn((kwargs["input_blob_size"]))
+ conv_block = Conv3dPwBnAct(
+ kwargs["in_channels"],
+ kwargs["out_channels"],
+ use_bn=False, # assume BN has already been fused for forward
+ )
+
+ if kwargs["mode"] == "deployable":
+ conv_block.convert(kwargs["input_blob_size"])
+ conv_block.eval()
+ if kwargs["quantize"] is True:
+ if kwargs["mode"] == "original": # manually fuse conv and relu
+ conv_block.kernel = torch.quantization.fuse_modules(
+ conv_block.kernel, ["conv", "act.act"]
+ )
+ conv_block = nn.Sequential(
+ torch.quantization.QuantStub(),
+ conv_block,
+ torch.quantization.DeQuantStub(),
+ )
+
+ conv_block.qconfig = torch.quantization.get_default_qconfig("qnnpack")
+ conv_block = torch.quantization.prepare(conv_block)
+ try:
+ conv_block = torch.quantization.convert(conv_block)
+ except Exception as e:
+ logging.info(
+ "benchmark_conv3d_pw_bn_relu: "
+ "catch exception '{}' with kwargs of {}".format(e, kwargs)
+ )
+
+ def func_to_benchmark_dummy() -> None:
+ return
+
+ return func_to_benchmark_dummy
+ traced_model = torch.jit.trace(conv_block, input_tensor, strict=False)
+ if kwargs["quantize"] is False:
+ traced_model = optimize_for_mobile(traced_model)
+
+ logging.info(f"model arch: {traced_model}")
+
+ def func_to_benchmark() -> None:
+ try:
+ _ = traced_model(input_tensor)
+ except Exception as e:
+ logging.info(
+ "benchmark_conv3d_pw_bn_relu: "
+ "catch exception '{}' with kwargs of {}".format(e, kwargs)
+ )
+
+ return
+
+ return func_to_benchmark
+
+ benchmark(
+ _benchmark_conv3d_pw_bn_relu_forward,
+ "benchmark_conv3d_pw_bn_relu",
+ kwargs_list,
+ num_iters=num_iters,
+ warmup_iters=2,
+ )
+
+ self.assertTrue(True)
+
+ def test_benchmark_conv3d_3x3x3_dw_bn_relu(self, num_iters: int = 20) -> None:
+ """
+ Benchmark Conv3d3x3x3DwBnAct with ReLU activation.
+ Note that the efficient block Conv3d3x3x3DwBnAct is designed for mobile CPUs with
+ the qnnpack backend; benchmarking on a server with another backend (e.g., fbgemm)
+ may yield different latency than running on a mobile CPU.
+ Args:
+ num_iters (int): number of iterations to perform benchmarking.
+ """
+ torch.backends.quantized.engine = "qnnpack"
+ kwargs_list = [
+ {
+ "mode": "original",
+ "input_blob_size": (1, 48, 4, 40, 40),
+ "in_channels": 48,
+ "quantize": False,
+ },
+ {
+ "mode": "deployable",
+ "input_blob_size": (1, 48, 4, 40, 40),
+ "in_channels": 48,
+ "quantize": False,
+ },
+ {
+ "mode": "original",
+ "input_blob_size": (1, 48, 4, 40, 40),
+ "in_channels": 48,
+ "quantize": True,
+ },
+ {
+ "mode": "deployable",
+ "input_blob_size": (1, 48, 4, 40, 40),
+ "in_channels": 48,
+ "quantize": True,
+ },
+ ]
+
+ def _benchmark_conv3d_3x3x3_dw_bn_relu_forward(**kwargs) -> Callable:
+ assert kwargs["mode"] in ("original", "deployable"), (
+ "kwargs['mode'] must be either 'original' or 'deployable',"
+ "but got {}.".format(kwargs["mode"])
+ )
+ input_tensor = torch.randn((kwargs["input_blob_size"]))
+ conv_block = Conv3d3x3x3DwBnAct(
+ kwargs["in_channels"],
+ use_bn=False, # assume BN has already been fused for forward
+ )
+
+ if kwargs["mode"] == "deployable":
+ conv_block.convert(kwargs["input_blob_size"])
+ conv_block.eval()
+ if kwargs["quantize"] is True:
+ if kwargs["mode"] == "original": # manually fuse conv and relu
+ conv_block.kernel = torch.quantization.fuse_modules(
+ conv_block.kernel, ["conv", "act.act"]
+ )
+ conv_block = nn.Sequential(
+ torch.quantization.QuantStub(),
+ conv_block,
+ torch.quantization.DeQuantStub(),
+ )
+
+ conv_block.qconfig = torch.quantization.get_default_qconfig("qnnpack")
+ conv_block = torch.quantization.prepare(conv_block)
+ try:
+ conv_block = torch.quantization.convert(conv_block)
+ except Exception as e:
+ logging.info(
+ "benchmark_conv3d_3x3x3_dw_bn_relu: "
+ "catch exception '{}' with kwargs of {}".format(e, kwargs)
+ )
+
+ def func_to_benchmark_dummy() -> None:
+ return
+
+ return func_to_benchmark_dummy
+
+ traced_model = torch.jit.trace(conv_block, input_tensor, strict=False)
+ if kwargs["quantize"] is False:
+ traced_model = optimize_for_mobile(traced_model)
+
+ logging.info(f"model arch: {traced_model}")
+
+ def func_to_benchmark() -> None:
+ try:
+ _ = traced_model(input_tensor)
+ except Exception as e:
+ logging.info(
+ "benchmark_conv3d_3x3x3_dw_bn_relu: "
+ "catch exception '{}' with kwargs of {}".format(e, kwargs)
+ )
+ return
+
+ return func_to_benchmark
+
+ benchmark(
+ _benchmark_conv3d_3x3x3_dw_bn_relu_forward,
+ "benchmark_conv3d_3x3x3_dw_bn_relu",
+ kwargs_list,
+ num_iters=num_iters,
+ warmup_iters=2,
+ )
+
+ self.assertTrue(True)
+
+ def test_benchmark_x3d_bottleneck_block(self, num_iters: int = 20) -> None:
+ """
+ Benchmark X3dBottleneckBlock.
+ Note that the efficient block X3dBottleneckBlock is designed for mobile CPUs with
+ the qnnpack backend; benchmarking on a server or laptop may yield different
+ latency than running on a mobile CPU.
+ Args:
+ num_iters (int): number of iterations to perform benchmarking.
+ """
+ torch.backends.quantized.engine = "qnnpack"
+ kwargs_list = [
+ {
+ "mode": "original",
+ "input_blob_size": (1, 48, 4, 20, 20),
+ "in_channels": 48,
+ "mid_channels": 108,
+ "out_channels": 48,
+ "quantize": False,
+ },
+ {
+ "mode": "deployable",
+ "input_blob_size": (1, 48, 4, 20, 20),
+ "in_channels": 48,
+ "mid_channels": 108,
+ "out_channels": 48,
+ "quantize": False,
+ },
+ {
+ "mode": "original",
+ "input_blob_size": (1, 48, 4, 20, 20),
+ "in_channels": 48,
+ "mid_channels": 108,
+ "out_channels": 48,
+ "quantize": True,
+ },
+ {
+ "mode": "deployable",
+ "input_blob_size": (1, 48, 4, 20, 20),
+ "in_channels": 48,
+ "mid_channels": 108,
+ "out_channels": 48,
+ "quantize": True,
+ },
+ ]
+
+ def _benchmark_x3d_bottleneck_forward(**kwargs) -> Callable:
+ assert kwargs["mode"] in ("original", "deployable"), (
+ "kwargs['mode'] must be either 'original' or 'deployable',"
+ "but got {}.".format(kwargs["mode"])
+ )
+ input_tensor = torch.randn((kwargs["input_blob_size"]))
+ conv_block = X3dBottleneckBlock(
+ kwargs["in_channels"],
+ kwargs["mid_channels"],
+ kwargs["out_channels"],
+ use_bn=(False, False, False), # Assume BN has been fused for forward
+ )
+
+ if kwargs["mode"] == "deployable":
+ conv_block.convert(kwargs["input_blob_size"])
+ conv_block.eval()
+ if kwargs["quantize"] is True:
+ conv_block = nn.Sequential(
+ torch.quantization.QuantStub(),
+ conv_block,
+ torch.quantization.DeQuantStub(),
+ )
+
+ conv_block.qconfig = torch.quantization.get_default_qconfig("qnnpack")
+ conv_block = torch.quantization.prepare(conv_block)
+ try:
+ conv_block = torch.quantization.convert(conv_block)
+ except Exception as e:
+ logging.info(
+ "benchmark_x3d_bottleneck_forward: "
+ "catch exception '{}' with kwargs of {}".format(e, kwargs)
+ )
+
+ def func_to_benchmark_dummy() -> None:
+ return
+
+ return func_to_benchmark_dummy
+
+ traced_model = torch.jit.trace(conv_block, input_tensor, strict=False)
+ if kwargs["quantize"] is False:
+ traced_model = optimize_for_mobile(traced_model)
+
+ logging.info(f"model arch: {traced_model}")
+
+ def func_to_benchmark() -> None:
+ try:
+ _ = traced_model(input_tensor)
+ except Exception as e:
+ logging.info(
+ "benchmark_x3d_bottleneck_forward: "
+ "catch exception '{}' with kwargs of {}".format(e, kwargs)
+ )
+ return
+
+ return func_to_benchmark
+
+ benchmark(
+ _benchmark_x3d_bottleneck_forward,
+ "benchmark_x3d_bottleneck_forward",
+ kwargs_list,
+ num_iters=num_iters,
+ warmup_iters=2,
+ )
+
+ self.assertTrue(True)
diff --git a/tests/benchmark_transforms.py b/tests/benchmark_transforms.py
new file mode 100644
index 00000000..f94aa493
--- /dev/null
+++ b/tests/benchmark_transforms.py
@@ -0,0 +1,82 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import unittest
+from typing import Callable
+
+import torch
+from fvcore.common.benchmark import benchmark
+from pytorchvideo.data.utils import thwc_to_cthw
+from pytorchvideo.transforms.functional import short_side_scale
+from utils import create_dummy_video_frames
+
+
+class TestBenchmarkTransforms(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_benchmark_short_side_scale_pytorch(self, num_iters: int = 10) -> None:
+ """
+ Benchmark scale operation with pytorch backend.
+ Args:
+ num_iters (int): number of iterations to perform benchmarking.
+ """
+ kwargs_list = [
+ {"temporal_size": 8, "ori_spatial_size": (128, 128), "dst_short_size": 112},
+ {
+ "temporal_size": 16,
+ "ori_spatial_size": (128, 128),
+ "dst_short_size": 112,
+ },
+ {
+ "temporal_size": 32,
+ "ori_spatial_size": (128, 128),
+ "dst_short_size": 112,
+ },
+ {"temporal_size": 8, "ori_spatial_size": (256, 256), "dst_short_size": 224},
+ {
+ "temporal_size": 16,
+ "ori_spatial_size": (256, 256),
+ "dst_short_size": 224,
+ },
+ {
+ "temporal_size": 32,
+ "ori_spatial_size": (256, 256),
+ "dst_short_size": 224,
+ },
+ {"temporal_size": 8, "ori_spatial_size": (320, 320), "dst_short_size": 224},
+ {
+ "temporal_size": 16,
+ "ori_spatial_size": (320, 320),
+ "dst_short_size": 224,
+ },
+ {
+ "temporal_size": 32,
+ "ori_spatial_size": (320, 320),
+ "dst_short_size": 224,
+ },
+ ]
+
+ def _init_benchmark_short_side_scale(**kwargs) -> Callable:
+ x = thwc_to_cthw(
+ create_dummy_video_frames(
+ kwargs["temporal_size"],
+ kwargs["ori_spatial_size"][0],
+ kwargs["ori_spatial_size"][1],
+ )
+ ).to(dtype=torch.float32)
+
+ def func_to_benchmark() -> None:
+ _ = short_side_scale(x, kwargs["dst_short_size"])
+ return
+
+ return func_to_benchmark
+
+ benchmark(
+ _init_benchmark_short_side_scale,
+ "benchmark_short_side_scale_pytorch",
+ kwargs_list,
+ num_iters=num_iters,
+ warmup_iters=2,
+ )
+ self.assertTrue(True)
diff --git a/tests/test_accelerator_deployment_mobile_cpu_model_conversion.py b/tests/test_accelerator_deployment_mobile_cpu_model_conversion.py
new file mode 100644
index 00000000..0ffa677d
--- /dev/null
+++ b/tests/test_accelerator_deployment_mobile_cpu_model_conversion.py
@@ -0,0 +1,83 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import logging
+import unittest
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+from pytorchvideo.accelerator.deployment.mobile_cpu.utils.model_conversion import (
+ convert_to_deployable_form,
+)
+from pytorchvideo.accelerator.efficient_blocks.efficient_block_base import (
+ EfficientBlockBase,
+)
+from pytorchvideo.models.accelerator.mobile_cpu.residual_blocks import (
+ X3dBottleneckBlock,
+)
+
+
+class TestDeploymentModelConversion(unittest.TestCase):
+ def test_X3dBottleneckBlock_model_conversion(self):
+ # Input tensor
+ input_blob_size = (1, 3, 4, 6, 6)
+ input_tensor = torch.randn(input_blob_size)
+
+ # Helper class to emulate a mix of efficient blocks and non-efficient blocks
+ class _quant_wrapper(nn.Module):
+ # A common config where user model is wrapped by QuantStub/DequantStub
+ def __init__(self):
+ super().__init__()
+ self.quant = torch.quantization.QuantStub() # Non efficient block
+ # X3dBottleneckBlock is an efficient block consisting of multiple efficient blocks
+ self.model = X3dBottleneckBlock(
+ 3,
+ 12,
+ 3,
+ )
+ self.dequant = torch.quantization.DeQuantStub() # Non efficient block
+
+ def forward(self, x):
+ x = self.quant(x)
+ x = self.model(x)
+ x = self.dequant(x)
+ return x
+
+ x3d_block_model_ref = _quant_wrapper()
+
+ # Get ref output
+ x3d_block_model_ref.eval()
+ out_ref = x3d_block_model_ref(input_tensor)
+ # Convert into deployment mode
+ x3d_block_model_converted = convert_to_deployable_form(
+ x3d_block_model_ref, input_tensor
+ )
+ out = x3d_block_model_converted(input_tensor)
+ # Check arithmetic equivalency
+ max_err = float(torch.max(torch.abs(out_ref - out)))
+ rel_err = torch.abs((out_ref - out) / out_ref)
+ max_rel_err = float(torch.max(rel_err))
+ logging.info(
+ (
+ "test_X3dBottleneckBlock_model_conversion: "
+ f"max_err {max_err}, max_rel_err {max_rel_err}"
+ )
+ )
+ self.assertTrue(max_err < 1e-3)
+ # Check all sub-modules converted
+ for iter_module in x3d_block_model_converted.modules():
+ if isinstance(iter_module, EfficientBlockBase) and (
+ hasattr(iter_module, "convert_flag")
+ ):
+ self.assertTrue(iter_module.convert_flag)
+ # Check all hooks removed
+ for iter_module in x3d_block_model_ref.modules():
+ assert iter_module._forward_hooks == OrderedDict(), (
+ f"{iter_module} in x3d_block_model_ref has non-empty _forward_hooks "
+ f"{iter_module._forward_hooks}"
+ )
+ for iter_module in x3d_block_model_converted.modules():
+ assert iter_module._forward_hooks == OrderedDict(), (
+ f"{iter_module} in x3d_block_model_converted has non-empty _forward_hooks "
+ f"{iter_module._forward_hooks}"
+ )
diff --git a/tests/test_accelerator_deployment_model_transmuter.py b/tests/test_accelerator_deployment_model_transmuter.py
new file mode 100644
index 00000000..6fa824c0
--- /dev/null
+++ b/tests/test_accelerator_deployment_model_transmuter.py
@@ -0,0 +1,87 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import logging
+import unittest
+from copy import deepcopy
+
+# Registers mobile_cpu transmuter functions
+import pytorchvideo.accelerator.deployment.mobile_cpu.transmuter # noqa: F401
+import torch
+import torch.nn as nn
+from pytorchvideo.accelerator.deployment.common.model_transmuter import transmute_model
+from pytorchvideo.accelerator.deployment.mobile_cpu.utils.model_conversion import (
+ convert_to_deployable_form,
+)
+from pytorchvideo.accelerator.efficient_blocks.efficient_block_base import (
+ EfficientBlockBase,
+)
+
+
+class TestModelTransmuter(unittest.TestCase):
+ def test_mobile_cpu_transmuter(self):
+ # Input tensor
+ input_blob_size = (1, 3, 2, 6, 6)
+ input_tensor = torch.randn(input_blob_size)
+
+ # Helper class to emulate user input model
+ class _residual_block(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.stem0 = nn.Conv3d(3, 3, kernel_size=(3, 1, 1), padding=(1, 0, 0))
+ self.stem1 = nn.Conv3d(3, 3, kernel_size=(5, 1, 1), padding=(2, 0, 0))
+ self.pw = nn.Conv3d(3, 6, kernel_size=1)
+ self.relu = nn.ReLU()
+ self.dw = nn.Conv3d(6, 6, kernel_size=3, padding=1, groups=6)
+ self.relu1 = nn.ReLU()
+ self.pwl = nn.Conv3d(6, 3, kernel_size=1)
+ self.relu2 = nn.ReLU()
+
+ def forward(self, x):
+ out = self.stem0(x)
+ out = self.stem1(out)
+ out = self.pw(out)
+ out = self.relu(out)
+ out = self.dw(out)
+ out = self.relu1(out)
+ out = self.pwl(out)
+ return self.relu2(out + x)
+
+ user_model_ref = _residual_block()
+
+ user_model_ref.eval()
+ out_ref = user_model_ref(input_tensor)
+
+ user_model_efficient = deepcopy(user_model_ref)
+ transmute_model(
+ user_model_efficient,
+ target_device="mobile_cpu",
+ )
+ logging.info(f"after convert_model {user_model_efficient}")
+ # Check whether the blocks have been replaced by efficient blocks
+ assert isinstance(user_model_efficient.pw, EfficientBlockBase), (
+ f"user_model_efficient.pw {user_model_efficient.pw.__class__.__name__} "
+ "is not converted!"
+ )
+ assert isinstance(user_model_efficient.dw, EfficientBlockBase), (
+ f"user_model_efficient.dw {user_model_efficient.dw.__class__.__name__} "
+ "is not converted!"
+ )
+ assert isinstance(user_model_efficient.pwl, EfficientBlockBase), (
+ f"user_model_efficient.pwl {user_model_efficient.pwl.__class__.__name__} "
+ "is not converted!"
+ )
+ user_model_efficient_converted = convert_to_deployable_form(
+ user_model_efficient, input_tensor
+ )
+ out = user_model_efficient_converted(input_tensor)
+ # Check arithmetic equivalency
+ max_err = float(torch.max(torch.abs(out_ref - out)))
+ rel_err = torch.abs((out_ref - out) / out_ref)
+ max_rel_err = float(torch.max(rel_err))
+ logging.info(
+ (
+ "test_mobile_cpu_transmuter: "
+ f"max_err {max_err}, max_rel_err {max_rel_err}"
+ )
+ )
+ self.assertTrue(max_err < 1e-3)
diff --git a/tests/test_accelerator_efficient_blocks_mobile_cpu_activation_attention.py b/tests/test_accelerator_efficient_blocks_mobile_cpu_activation_attention.py
new file mode 100644
index 00000000..2f6e4a63
--- /dev/null
+++ b/tests/test_accelerator_efficient_blocks_mobile_cpu_activation_attention.py
@@ -0,0 +1,55 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import logging
+import unittest
+from copy import deepcopy
+
+import torch
+from pytorchvideo.layers.accelerator.mobile_cpu.activation_functions import (
+ supported_act_functions,
+)
+from pytorchvideo.layers.accelerator.mobile_cpu.attention import SqueezeExcitation
+
+
+class TestActivationAttentionEquivalency(unittest.TestCase):
+ def test_activation_equivalency(self):
+ # Input tensor
+ input_tensor = torch.randn(1, 3, 4, 6, 6)
+ for iter_activation_name in supported_act_functions:
+ act_func_ref = supported_act_functions[iter_activation_name]()
+ act_func_convert = deepcopy(act_func_ref)
+ act_func_convert.convert()
+ # Get output of both activations
+ out0 = act_func_ref(input_tensor)
+ out1 = act_func_convert(input_tensor)
+ # Check arithmetic equivalency
+ max_err = float(torch.max(torch.abs(out0 - out1)))
+
+ logging.info(
+ f"test_activation_equivalency: {iter_activation_name} max_err {max_err}"
+ )
+ self.assertTrue(max_err < 1e-3)
+
+ def test_squeeze_excite_equivalency(self):
+ # Input tensor
+ input_tensor = torch.randn(1, 16, 4, 6, 6)
+ # Instantiate ref and convert se modules.
+ se_ref = SqueezeExcitation(16, num_channels_reduced=2, is_3d=True)
+ se_ref.eval()
+ se_convert = deepcopy(se_ref)
+ se_convert.convert((1, 16, 4, 6, 6))
+ # Get output of both activations
+ out0 = se_ref(input_tensor)
+ out1 = se_convert(input_tensor)
+ # Check arithmetic equivalency
+ max_err = float(torch.max(torch.abs(out0 - out1)))
+ rel_err = torch.abs((out0 - out1) / out0)
+ max_rel_err = float(torch.max(rel_err))
+
+ logging.info(
+ (
+ "test_squeeze_excite_equivalency: "
+ f"max_err {max_err}, max_rel_err {max_rel_err}"
+ )
+ )
+ self.assertTrue(max_err < 1e-3)
diff --git a/tests/test_accelerator_efficient_blocks_mobile_cpu_conv3d.py b/tests/test_accelerator_efficient_blocks_mobile_cpu_conv3d.py
new file mode 100644
index 00000000..1ebbcb93
--- /dev/null
+++ b/tests/test_accelerator_efficient_blocks_mobile_cpu_conv3d.py
@@ -0,0 +1,144 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import logging
+import unittest
+from copy import deepcopy
+
+import torch
+import torch.nn as nn
+from pytorchvideo.layers.accelerator.mobile_cpu.convolutions import (
+ Conv3d3x1x1BnAct,
+ Conv3d3x3x3DwBnAct,
+ Conv3d5x1x1BnAct,
+ Conv3dPwBnAct,
+)
+
+
+class TestConv3dBlockEquivalency(unittest.TestCase):
+ def test_Conv3dPwBnAct_equivalency(self):
+ # Input tensor
+ input_tensor = torch.randn(1, 3, 4, 6, 6)
+ # A conv block
+ l0 = Conv3dPwBnAct(3, 12)
+ l1 = Conv3dPwBnAct(
+ 12, 3, bias=True, activation="identity"
+ ) # Skip relu to avoid NaN for rel error
+ seq0 = nn.Sequential(l0, l1)
+ seq0.eval()
+ out0 = seq0(input_tensor)
+ # Replicate the conv block
+ l0_1 = deepcopy(l0)
+ l1_1 = deepcopy(l1)
+ # Convert into deployment mode
+ l0_1.convert((1, 3, 4, 6, 6)) # Input tensor size is (1,3,4,6,6)
+ l1_1.convert((1, 12, 4, 6, 6)) # Input tensor size is (1,12,4,6,6)
+ seq1 = nn.Sequential(l0_1, l1_1)
+ out1 = seq1(input_tensor)
+ # Check arithmetic equivalency
+ max_err = float(torch.max(torch.abs(out0 - out1)))
+ rel_err = torch.abs((out0 - out1) / out0)
+ max_rel_err = float(torch.max(rel_err))
+
+ logging.info(
+ (
+ "test_Conv3dPwBnAct_equivalency: "
+ f"max_err {max_err}, max_rel_err {max_rel_err}"
+ )
+ )
+ self.assertTrue(max_err < 1e-3)
+
+ def test_Conv3d3x3x3DwBnAct_equivalency(self):
+ # Input tensor
+ input_tensor = torch.randn(1, 3, 4, 6, 6)
+ # A conv block
+ l0 = Conv3dPwBnAct(3, 12)
+ l1 = Conv3d3x3x3DwBnAct(12)
+ l2 = Conv3dPwBnAct(
+ 12, 3, bias=True, activation="identity"
+ ) # Skip relu to avoid NaN for relative error
+ seq0 = nn.Sequential(l0, l1, l2)
+ seq0.eval()
+ out0 = seq0(input_tensor)
+ # Replicate the conv block
+ l0_1 = deepcopy(l0)
+ l1_1 = deepcopy(l1)
+ l2_1 = deepcopy(l2)
+ # Convert into deployment mode
+ l0_1.convert((1, 3, 4, 6, 6)) # Input tensor size is (1,3,4,6,6)
+ l1_1.convert((1, 12, 4, 6, 6)) # Input tensor size is (1,12,4,6,6)
+ l2_1.convert((1, 12, 4, 6, 6)) # Input tensor size is (1,12,4,6,6)
+ seq1 = nn.Sequential(l0_1, l1_1, l2_1)
+ out1 = seq1(input_tensor)
+ # Check arithmetic equivalency
+ max_err = float(torch.max(torch.abs(out0 - out1)))
+ rel_err = torch.abs((out0 - out1) / out0)
+ max_rel_err = float(torch.max(rel_err))
+ logging.info(
+ (
+ "test_Conv3d3x3x3DwBnAct_equivalency: "
+ f"max_err {max_err}, max_rel_err {max_rel_err}"
+ )
+ )
+ self.assertTrue(max_err < 1e-3)
+
+ def test_Conv3d3x1x1BnAct_equivalency(self):
+ for input_temporal in range(3):
+ input_size = (1, 3, input_temporal + 1, 6, 6)
+ # Input tensor
+ input_tensor = torch.randn(input_size)
+ # A conv block
+ l0 = Conv3d3x1x1BnAct(3, 6)
+ l0.eval()
+ out0 = l0(input_tensor)
+ # Replicate the conv block
+ l0_1 = deepcopy(l0)
+ # Convert into deployment mode
+ l0_1.convert(input_size) # Input tensor size is (1,3,4,6,6)
+ out1 = l0_1(input_tensor)
+ # Check output size
+ assert (
+ out0.size() == out1.size()
+ ), f"Sizes of out0 {out0.size()} and out1 {out1.size()} are different."
+ # Check arithmetic equivalency
+ max_err = float(torch.max(torch.abs(out0 - out1)))
+ rel_err = torch.abs((out0 - out1) / out0)
+ max_rel_err = float(torch.max(rel_err))
+ logging.info(
+ (
+ "test_Conv3d3x1x1BnAct_equivalency: "
+ f"input tensor size: {input_size}"
+ f"max_err {max_err}, max_rel_err {max_rel_err}"
+ )
+ )
+ self.assertTrue(max_err < 1e-3)
+
+ def test_Conv3d5x1x1BnAct_equivalency(self):
+ for input_temporal in range(5):
+ input_size = (1, 3, input_temporal + 1, 6, 6)
+ # Input tensor
+ input_tensor = torch.randn(input_size)
+ # A conv block
+ l0 = Conv3d5x1x1BnAct(3, 6)
+ l0.eval()
+ out0 = l0(input_tensor)
+ # Replicate the conv block
+ l0_1 = deepcopy(l0)
+ # Convert into deployment mode
+ l0_1.convert(input_size) # Input tensor size is (1,3,4,6,6)
+ out1 = l0_1(input_tensor)
+ # Check output size
+ assert (
+ out0.size() == out1.size()
+ ), f"Sizes of out0 {out0.size()} and out1 {out1.size()} are different."
+ # Check arithmetic equivalency
+ max_err = float(torch.max(torch.abs(out0 - out1)))
+ rel_err = torch.abs((out0 - out1) / out0)
+ max_rel_err = float(torch.max(rel_err))
+ logging.info(
+ (
+ "test_Conv3d5x1x1BnAct_equivalency: "
+ f"input tensor size: {input_size}"
+ f"max_err {max_err}, max_rel_err {max_rel_err}"
+ )
+ )
+ self.assertTrue(max_err < 1e-3)
diff --git a/tests/test_accelerator_efficient_blocks_mobile_cpu_head_layer.py b/tests/test_accelerator_efficient_blocks_mobile_cpu_head_layer.py
new file mode 100644
index 00000000..79fd2f3d
--- /dev/null
+++ b/tests/test_accelerator_efficient_blocks_mobile_cpu_head_layer.py
@@ -0,0 +1,81 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import logging
+import unittest
+from copy import deepcopy
+
+import torch
+from pytorchvideo.layers.accelerator.mobile_cpu.fully_connected import FullyConnected
+from pytorchvideo.layers.accelerator.mobile_cpu.pool import (
+ AdaptiveAvgPool2d,
+ AdaptiveAvgPool2dOutSize1,
+ AdaptiveAvgPool3d,
+ AdaptiveAvgPool3dOutSize1,
+)
+
+
+class TestHeadLayerEquivalency(unittest.TestCase):
+ def test_head_layer_equivalency(self):
+ for input_dim in (4, 5): # 4 for BCHW, 5 for BCTHW
+ input_tensor_size = (1, 3, 4, 6, 6) if input_dim == 5 else (1, 3, 6, 6)
+ input_tensor = torch.randn(input_tensor_size)
+ # Build up common head layer: pool + linear
+ if input_dim == 5:
+ pool_efficient_block_ref = AdaptiveAvgPool3d(1)
+ pool_efficient_block_1 = AdaptiveAvgPool3d(1)
+ pool_efficient_block_2 = AdaptiveAvgPool3dOutSize1()
+
+ else:
+ pool_efficient_block_ref = AdaptiveAvgPool2d(1)
+ pool_efficient_block_1 = AdaptiveAvgPool2d(1)
+ pool_efficient_block_2 = AdaptiveAvgPool2dOutSize1()
+ pool_efficient_block_1.convert()
+ pool_efficient_block_2.convert(input_tensor_size)
+ linear_ref = FullyConnected(3, 8)
+ linear_1 = deepcopy(linear_ref)
+ linear_1.convert()
+
+ ref_out = pool_efficient_block_ref(input_tensor)
+ if input_dim == 5:
+ ref_out = ref_out.permute((0, 2, 3, 4, 1))
+ else:
+ ref_out = ref_out.permute((0, 2, 3, 1))
+ ref_out = linear_ref(ref_out)
+
+ head_out_1 = pool_efficient_block_1(input_tensor)
+ if input_dim == 5:
+ head_out_1 = head_out_1.permute((0, 2, 3, 4, 1))
+ else:
+ head_out_1 = head_out_1.permute((0, 2, 3, 1))
+ head_out_1 = linear_1(head_out_1)
+ # Check arithmetic equivalency
+ max_err = float(torch.max(torch.abs(ref_out - head_out_1)))
+ rel_err = torch.abs((ref_out - head_out_1) / ref_out)
+ max_rel_err = float(torch.max(rel_err))
+ logging.info(
+ (
+ "test_head_layer_equivalency: AdaptiveAvgPool + Linear"
+ f"input tensor size: {input_tensor_size}"
+ f"max_err {max_err}, max_rel_err {max_rel_err}"
+ )
+ )
+ self.assertTrue(max_err < 1e-3)
+
+ head_out_2 = pool_efficient_block_2(input_tensor)
+ if input_dim == 5:
+ head_out_2 = head_out_2.permute((0, 2, 3, 4, 1))
+ else:
+ head_out_2 = head_out_2.permute((0, 2, 3, 1))
+ head_out_2 = linear_1(head_out_2)
+ # Check arithmetic equivalency
+ max_err = float(torch.max(torch.abs(ref_out - head_out_2)))
+ rel_err = torch.abs((ref_out - head_out_2) / ref_out)
+ max_rel_err = float(torch.max(rel_err))
+ logging.info(
+ (
+ "test_head_layer_equivalency: AdaptiveAvgPoolOutSize1 + Linear"
+ f"input tensor size: {input_tensor_size}"
+ f"max_err {max_err}, max_rel_err {max_rel_err}"
+ )
+ )
+ self.assertTrue(max_err < 1e-3)
diff --git a/tests/test_accelerator_efficient_blocks_mobile_cpu_residual_block.py b/tests/test_accelerator_efficient_blocks_mobile_cpu_residual_block.py
new file mode 100644
index 00000000..f1a6eca3
--- /dev/null
+++ b/tests/test_accelerator_efficient_blocks_mobile_cpu_residual_block.py
@@ -0,0 +1,56 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import logging
+import unittest
+from copy import deepcopy
+
+import torch
+from pytorchvideo.models.accelerator.mobile_cpu.residual_blocks import (
+ X3dBottleneckBlock,
+)
+
+
+class TestConv3dBlockEquivalency(unittest.TestCase):
+ def test_X3dBottleneckBlock_equivalency(self):
+ # Input tensor
+ input_blob_size = (1, 3, 4, 6, 6)
+ input_tensor = torch.randn(input_blob_size)
+ for use_residual in (True, False):
+ for spatial_stride in (1, 2):
+ for se_ratio in (0, 0.5):
+ for act_func_0 in ("relu", "swish", "hswish", "identity"):
+ for act_func_1 in ("relu", "swish", "hswish", "identity"):
+ for act_func_2 in ("relu", "swish", "hswish", "identity"):
+ act_func_tuple = (act_func_0, act_func_1, act_func_2)
+ # X3dBottleneckBlock
+ x3d_block_ref = X3dBottleneckBlock(
+ 3,
+ 16,
+ 3,
+ use_residual=use_residual,
+ spatial_stride=spatial_stride,
+ se_ratio=se_ratio,
+ act_functions=act_func_tuple,
+ )
+ x3d_block = deepcopy(x3d_block_ref)
+ # Get ref output
+ x3d_block_ref.eval()
+ out_ref = x3d_block_ref(input_tensor)
+ # Convert into deployment mode
+ x3d_block.convert(input_blob_size)
+ out = x3d_block(input_tensor)
+ # Check arithmetic equivalency
+ max_err = float(torch.max(torch.abs(out_ref - out)))
+ rel_err = torch.abs((out_ref - out) / out_ref)
+ max_rel_err = float(torch.max(rel_err))
+ logging.info(
+ (
+ "test_X3dBottleneckBlock_equivalency: "
+ f"current setting: use_residual {use_residual}, "
+ f"spatial_stride {spatial_stride}, "
+ f"se_ratio {se_ratio}, "
+ f"act_func_tuple {act_func_tuple}, "
+ f"max_err {max_err}, max_rel_err {max_rel_err}"
+ )
+ )
+ self.assertTrue(max_err < 1e-3)
diff --git a/tests/test_accelerator_models_efficient_x3d.py b/tests/test_accelerator_models_efficient_x3d.py
new file mode 100644
index 00000000..dffca3d5
--- /dev/null
+++ b/tests/test_accelerator_models_efficient_x3d.py
@@ -0,0 +1,102 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+import os
+import unittest
+
+import torch
+from pytorchvideo.models.accelerator.mobile_cpu.efficient_x3d import create_x3d
+
+
+class TestEfficientX3d(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_create_x3d(self):
+ """
+ To test different versions, set the (expansion, clip_length, crop_size) to:
+ X3D-XS: ("XS", 4, 160)
+ X3D-S: ("S", 13, 160)
+ X3D-M: ("M", 16, 224)
+ X3D-L: ("L", 16, 312)
+ """
+ for (expansion, input_clip_length, input_crop_size,) in [
+ ("XS", 4, 160),
+ ]:
+ model = create_x3d(expansion=expansion)
+
+ # Test forwarding.
+ for tensor in TestEfficientX3d._get_inputs(
+ input_clip_length, input_crop_size
+ ):
+ if tensor.shape[1] != 3:
+ with self.assertRaises(RuntimeError):
+ out = model(tensor)
+ continue
+
+ out = model(tensor)
+
+ output_shape = out.shape
+ output_shape_gt = (tensor.shape[0], 400)
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ def test_load_hubconf(self):
+ path = os.path.join(
+ os.path.dirname(os.path.realpath(__file__)),
+ "..",
+ )
+ for (input_clip_length, input_crop_size, model_name) in [
+ (4, 160, "efficient_x3d_xs"),
+ (13, 160, "efficient_x3d_s"),
+ ]:
+ model = torch.hub.load(
+ repo_or_dir=path,
+ source="local",
+ model=model_name,
+ pretrained=False,
+ )
+ self.assertIsNotNone(model)
+
+ # Test forwarding.
+ for tensor in TestEfficientX3d._get_inputs(
+ input_clip_length, input_crop_size
+ ):
+ if tensor.shape[1] != 3:
+ with self.assertRaises(RuntimeError):
+ out = model(tensor)
+ continue
+
+ out = model(tensor)
+
+ output_shape = out.shape
+ output_shape_gt = (tensor.shape[0], 400)
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ @staticmethod
+ def _get_inputs(clip_length: int = 4, crop_size: int = 160) -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yields:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random inputs as test cases.
+ shapes = (
+ (1, 3, clip_length, crop_size, crop_size),
+ (2, 3, clip_length, crop_size, crop_size),
+ )
+ for shape in shapes:
+ yield torch.rand(shape)
diff --git a/tests/test_data_charades_dataset.py b/tests/test_data_charades_dataset.py
new file mode 100644
index 00000000..177d1549
--- /dev/null
+++ b/tests/test_data_charades_dataset.py
@@ -0,0 +1,114 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import contextlib
+import pathlib
+import tempfile
+import unittest
+
+from pytorchvideo.data import Charades
+from pytorchvideo.data.clip_sampling import make_clip_sampler
+from torch.utils.data import SequentialSampler
+from utils import temp_frame_video
+
+
+@contextlib.contextmanager
+def temp_charades_dataset():
+ frame_names = [f"{str(i)}.png" for i in range(3)]
+
+ # Create csv containing 2 test frame videos.
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as f:
+ f.write("original_vido_id video_id frame_id path labels\n".encode())
+
+ # Frame video 1
+ with temp_frame_video(frame_names) as (frame_1_video_dir, data_1):
+ for i, frame_name in enumerate(frame_names):
+ original_video_id = str(frame_1_video_dir)
+ video_id = "1"
+ frame_id = str(i)
+ path = pathlib.Path(frame_1_video_dir) / frame_name
+ label = "0"
+ f.write(
+ f"{original_video_id} {video_id} {frame_id} {path} {label}\n".encode()
+ )
+
+ # Frame video 2
+ with temp_frame_video(frame_names) as (frame_2_video_dir, data_2):
+ for i, frame_name in enumerate(frame_names):
+ original_video_id = str(frame_2_video_dir)
+ video_id = "2"
+ frame_id = str(i)
+ path = pathlib.Path(frame_2_video_dir) / frame_name
+ label = "1"
+ f.write(
+ f"{original_video_id} {video_id} {frame_id} {path} {label}\n".encode()
+ )
+
+ f.close()
+ yield f.name, data_1, data_2
+
+
+class TestCharadesDataset(unittest.TestCase):
+ def test_single_clip_per_video_works(self):
+ with temp_charades_dataset() as (filename, video_1, video_2):
+ clip_sampler = make_clip_sampler(
+ "uniform", 0.1 # Total duration of 3 frames at 30fps is 0.1 seconds.
+ )
+ dataset = Charades(
+ filename, clip_sampler=clip_sampler, video_sampler=SequentialSampler
+ )
+ expected = [([[0], [0], [0]], video_1), ([[1], [1], [1]], video_2)]
+ for sample, expected_sample in zip(dataset, expected):
+ self.assertEqual(sample["label"], expected_sample[0])
+ self.assertTrue(sample["video"].equal(expected_sample[1]))
+
+ def test_multiple_clips_per_video_works(self):
+ with temp_charades_dataset() as (filename, video_1, video_2):
+ clip_sampler = make_clip_sampler(
+ "uniform", 0.033 # Expects each clip to have 1 frame each.
+ )
+ dataset = Charades(
+ filename, clip_sampler=clip_sampler, video_sampler=SequentialSampler
+ )
+
+ expected = [
+ ([[0]], video_1[:, 0:1]),
+ ([[0]], video_1[:, 1:2]),
+ ([[0]], video_1[:, 2:3]),
+ ([[1]], video_2[:, 0:1]),
+ ([[1]], video_2[:, 1:2]),
+ ([[1]], video_2[:, 2:3]),
+ ]
+ for sample, expected_sample in zip(dataset, expected):
+ self.assertEqual(sample["label"], expected_sample[0])
+ self.assertTrue(sample["video"].equal(expected_sample[1]))
+
+ def test_multiple_labels_per_frame(self):
+ frame_names = [f"{str(i)}.png" for i in range(3)]
+
+ # Create csv containing a test frame video.
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as f:
+ f.write("original_vido_id video_id frame_id path labels\n".encode())
+ with temp_frame_video(frame_names) as (frame_1_video_dir, data_1):
+ for i, frame_name in enumerate(frame_names):
+ original_video_id = str(frame_1_video_dir)
+ video_id = "1"
+ frame_id = str(i)
+ path = pathlib.Path(frame_1_video_dir) / frame_name
+ label = "0,100"
+ f.write(
+ f"{original_video_id} {video_id} {frame_id} {path} {label}\n".encode()
+ )
+
+ f.close()
+
+ clip_sampler = make_clip_sampler(
+ "random",
+ 0.1, # Total duration of 3 frames at 30fps is 0.1 seconds.
+ )
+ dataset = Charades(
+ f.name, clip_sampler=clip_sampler, video_sampler=SequentialSampler
+ )
+
+ sample = next(dataset)
+ self.assertEqual(sample["label"], [[0, 100], [0, 100], [0, 100]])
+ self.assertTrue(sample["video"].equal(data_1))
diff --git a/tests/test_data_dataset_manifest_utils.py b/tests/test_data_dataset_manifest_utils.py
new file mode 100644
index 00000000..7293091a
--- /dev/null
+++ b/tests/test_data_dataset_manifest_utils.py
@@ -0,0 +1,148 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import unittest
+import unittest.mock
+
+from pytorchvideo.data.dataset_manifest_utils import (
+ EncodedVideoInfo,
+ VideoDataset,
+ VideoFrameInfo,
+ VideoInfo,
+)
+from utils import MOCK_VIDEO_IDS, MOCK_VIDEO_INFOS, get_flat_video_frames
+
+
+class TestDatasetManifestUtils(unittest.TestCase):
+ def test_VideoFrameInfo(self):
+ video_frame_info = VideoFrameInfo(
+ # This is a key-mapping as the underlying
+ # annotation files are of these string columns
+ **{
+ "video_id": "P01_012",
+ "location": "c:/",
+ "frame_file_stem": "P01_012_",
+ "frame_string_length": "20",
+ "min_frame_number": "0",
+ "max_frame_number": "22",
+ "file_extension": "png",
+ }
+ )
+ self.assertEqual(video_frame_info.video_id, "P01_012")
+ self.assertEqual(video_frame_info.location, "c:/")
+ self.assertEqual(video_frame_info.frame_file_stem, "P01_012_")
+ self.assertEqual(video_frame_info.frame_string_length, 20)
+ self.assertEqual(video_frame_info.min_frame_number, 0)
+ self.assertEqual(video_frame_info.max_frame_number, 22)
+ self.assertEqual(video_frame_info.file_extension, "png")
+
+ def test_EncodedVideoInfo(self):
+ encoded_video_info = EncodedVideoInfo(
+ # This is a key-mapping as the underlying epic-kitchen
+ # annotation files are of these string columns
+ **{"video_id": "P01_12", "file_path": "c:/P01_12.mp4"}
+ )
+ self.assertEqual(encoded_video_info.video_id, "P01_12")
+ self.assertEqual(encoded_video_info.file_path, "c:/P01_12.mp4")
+
+ def test_VideoInfo(self):
+ video_info = VideoInfo(
+ # This is a key-mapping as the underlying epic-kitchen
+ # annotation files are of these string columns
+ **{
+ "video_id": "P01_01",
+ "resolution": "1000x200",
+ "duration": "123.45",
+ "fps": "59.9",
+ }
+ )
+ self.assertEqual(video_info.video_id, "P01_01")
+ self.assertEqual(video_info.resolution, "1000x200")
+ self.assertEqual(video_info.duration, 123.45)
+ self.assertEqual(video_info.fps, 59.9)
+
+ def test_frame_number_to_filepath(self):
+ file_names_vid4 = VideoDataset._frame_number_to_filepaths(
+ MOCK_VIDEO_IDS[3],
+ get_flat_video_frames("testdirectory", "jpg"),
+ MOCK_VIDEO_INFOS,
+ )
+ file_path = file_names_vid4[100]
+ self.assertEqual(
+ file_path, f"testdirectory/{MOCK_VIDEO_IDS[3]}/frame_0000000101.jpg"
+ )
+ with self.assertRaises(IndexError):
+ file_path = file_names_vid4[10000]
+ file_path = file_names_vid4[-1]
+ self.assertEqual(
+ file_path, f"testdirectory/{MOCK_VIDEO_IDS[3]}/frame_0000001530.jpg"
+ )
+
+ file_names_vid2 = VideoDataset._frame_number_to_filepaths(
+ MOCK_VIDEO_IDS[1],
+ get_flat_video_frames("testdirectory2", "png"),
+ MOCK_VIDEO_INFOS,
+ )
+ file_path = file_names_vid2[0]
+ self.assertEqual(
+ file_path, f"testdirectory2/{MOCK_VIDEO_IDS[1]}/frame_0000000002.png"
+ )
+ file_path = file_names_vid2[2999]
+ self.assertEqual(
+ file_path, f"testdirectory2/{MOCK_VIDEO_IDS[1]}/frame_0000003001.png"
+ )
+ with self.assertRaises(IndexError):
+ file_path = file_names_vid2[3000]
+
+ def test_remove_video_info_missing_or_incomplete_videos(self):
+ video_infos_a = MOCK_VIDEO_INFOS.copy()
+ video_frames_a = get_flat_video_frames("testdirectory2", "jpg")
+ video_frames_a_copy = video_frames_a.copy()
+
+ # No-Op
+ VideoDataset._remove_video_info_missing_or_incomplete_videos(
+ video_frames_a, video_infos_a
+ )
+
+ self.assertEqual(len(video_infos_a), len(MOCK_VIDEO_INFOS))
+ for video_id in video_infos_a:
+ self.assertEqual(video_infos_a[video_id], MOCK_VIDEO_INFOS[video_id])
+
+ self.assertEqual(len(video_frames_a), len(video_frames_a_copy))
+ for video_id in video_frames_a:
+ self.assertEqual(video_frames_a[video_id], video_frames_a_copy[video_id])
+
+ video_infos_b = MOCK_VIDEO_INFOS.copy()
+ video_frames_b = video_frames_a_copy.copy()
+
+ # Unmatched video info, should be removed
+ video_infos_b["P07_001"] = VideoInfo(
+ video_id="P07_001", resolution="720x1280", duration=17.001, fps=30
+ )
+
+        # Unmatched video frame entry; should be removed. The id used here is a
+        # hypothetical one with no matching entry in the mock video info manifest.
+        video_frames_b["P99_001"] = VideoFrameInfo(
+            video_id="P99_001", location="testdirectory2/P99_001",
+            frame_file_stem="frame_", frame_string_length=16,
+            min_frame_number=1, max_frame_number=1530, file_extension="jpg",
+        )
+
+        # Video info below implies roughly 6000 frames (duration * fps) while the
+        # frame manifest only provides 600, so both entries should be dropped.
+        video_frames_b["P08_001"] = VideoFrameInfo(
+            video_id="P08_001", location="testdirectory2/P08_001",
+            frame_file_stem="frame_", frame_string_length=16,
+            min_frame_number=1, max_frame_number=600, file_extension="jpg",
+        )
+
+ video_infos_b["P08_001"] = VideoInfo(
+ video_id="P08_001", resolution="720x1280", duration=100, fps=60
+ )
+
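+        # The cleanup below is expected to prune *both* dicts: orphaned entries on
+        # either side as well as videos whose frame manifest is largely incomplete
+        # (the exact completeness threshold is an implementation detail of
+        # VideoDataset).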
+ VideoDataset._remove_video_info_missing_or_incomplete_videos(
+ video_frames_b, video_infos_b
+ )
+
+        # All newly added entries should have been removed
+ self.assertEqual(len(video_infos_b), len(MOCK_VIDEO_INFOS))
+ for video_id in video_infos_b:
+ self.assertEqual(video_infos_b[video_id], MOCK_VIDEO_INFOS[video_id])
+
+ self.assertEqual(len(video_frames_b), len(video_frames_a_copy))
+ for video_id in video_frames_b:
+ self.assertEqual(video_frames_b[video_id], video_frames_a_copy[video_id])
diff --git a/tests/test_data_domsev_dataset.py b/tests/test_data_domsev_dataset.py
new file mode 100644
index 00000000..98bdd04f
--- /dev/null
+++ b/tests/test_data_domsev_dataset.py
@@ -0,0 +1,242 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import tempfile
+import unittest
+import unittest.mock
+from contextlib import ExitStack
+from pathlib import Path
+
+import torch
+from parameterized import parameterized
+from pytorchvideo.data.dataset_manifest_utils import VideoClipInfo, VideoDatasetType
+from pytorchvideo.data.domsev import (
+ ActivityData,
+ DomsevDataset,
+ frame_index_to_seconds,
+ get_overlap_for_time_range_pair,
+ seconds_to_frame_index,
+)
+from pytorchvideo.data.utils import save_dataclass_objs_to_headered_csv
+from utils import (
+ MOCK_VIDEO_IDS,
+ MOCK_VIDEO_INFOS,
+ get_encoded_video_infos,
+ get_flat_video_frames,
+)
+
+
+class TestDomsevDataset(unittest.TestCase):
+
+ # video_id: str
+ # start_time: float # Start time of the activity, in seconds
+ # stop_time: float # Stop time of the activity, in seconds
+ # start_frame: int # 0-indexed ID of the start frame (inclusive)
+    # stop_frame: int # 0-indexed ID of the stop frame (inclusive)
+ # activity_id: int
+ # activity_name: str
+ ACTIVITIES_DATA = {
+ MOCK_VIDEO_IDS[0]: [
+ ActivityData(
+ MOCK_VIDEO_IDS[0],
+ 0.0,
+ 6.0,
+ 1,
+ 181,
+ 1,
+ "walking",
+ ),
+ ActivityData(
+ MOCK_VIDEO_IDS[0],
+ 6.0333333,
+ 10.0,
+ 182,
+ 301,
+ 2,
+ "running",
+ ),
+ ActivityData(
+ MOCK_VIDEO_IDS[0],
+ 10.033333,
+ 20.0,
+ 302,
+ 601,
+ 0,
+ "none",
+ ),
+ ],
+ MOCK_VIDEO_IDS[1]: [
+ ActivityData(
+ MOCK_VIDEO_IDS[1],
+ 3.0,
+ 5.0,
+ 181,
+ 301,
+ 7,
+ "cooking",
+ ),
+ ],
+ MOCK_VIDEO_IDS[2]: [
+ ActivityData(
+ MOCK_VIDEO_IDS[2],
+ 100.0,
+ 200.0,
+ 3001,
+ 6001,
+ 9,
+ "observing",
+ ),
+ ],
+ MOCK_VIDEO_IDS[3]: [
+ ActivityData(
+ MOCK_VIDEO_IDS[3],
+ 10.0,
+ 20.0,
+ 901,
+ 1801,
+ 5,
+ "driving",
+ ),
+ ],
+ }
+
+ def setUp(self):
+ pass
+
+ def test_seconds_to_frame_index(self):
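+        # These checks are consistent with frame_index = floor(seconds * fps),
+        # plus one when frames are not zero-indexed (assumed behavior).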
+ self.assertEqual(seconds_to_frame_index(10.56, 1, zero_indexed=True), 10)
+ self.assertEqual(seconds_to_frame_index(10.56, 1, zero_indexed=False), 11)
+
+ self.assertEqual(seconds_to_frame_index(9.99, 1, zero_indexed=True), 9)
+ self.assertEqual(seconds_to_frame_index(9.99, 1, zero_indexed=False), 10)
+
+ self.assertEqual(seconds_to_frame_index(1.01, 10, zero_indexed=True), 10)
+ self.assertEqual(seconds_to_frame_index(1.01, 10, zero_indexed=False), 11)
+
+ def test_frame_index_to_seconds(self):
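+        # These checks are consistent with seconds = frame_index / fps when
+        # zero-indexed and (frame_index - 1) / fps otherwise (assumed behavior).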
+ self.assertEqual(frame_index_to_seconds(1, 1, zero_indexed=True), 1.0)
+ self.assertEqual(frame_index_to_seconds(1, 1, zero_indexed=False), 0.0)
+ self.assertEqual(frame_index_to_seconds(2, 1, zero_indexed=False), 1.0)
+
+ self.assertEqual(frame_index_to_seconds(30, 30, zero_indexed=True), 1.0)
+ self.assertEqual(frame_index_to_seconds(30, 30, zero_indexed=False), 29 / 30)
+
+ self.assertEqual(frame_index_to_seconds(1, 10, zero_indexed=True), 0.1)
+ self.assertEqual(frame_index_to_seconds(2, 10, zero_indexed=False), 0.1)
+
+ def test_get_overlap_for_time_range_pair(self):
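+        # The overlap of two time ranges is expected to be
+        # (max(start_1, start_2), min(end_1, end_2)).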
+ self.assertEqual(get_overlap_for_time_range_pair(0, 1, 0.1, 0.2), (0.1, 0.2))
+ self.assertEqual(get_overlap_for_time_range_pair(0.1, 0.2, 0, 1), (0.1, 0.2))
+ self.assertEqual(get_overlap_for_time_range_pair(0, 1, 0.9, 1.1), (0.9, 1.0))
+ self.assertEqual(get_overlap_for_time_range_pair(0, 0.2, 0.1, 1), (0.1, 0.2))
+
+ @parameterized.expand([(VideoDatasetType.Frame,), (VideoDatasetType.EncodedVideo,)])
+ def test__len__(self, dataset_type):
+        with tempfile.TemporaryDirectory(prefix="TestDomsevDataset") as tempdir:
+ tempdir = Path(tempdir)
+
+ video_info_file = tempdir / "test_video_info.csv"
+ save_dataclass_objs_to_headered_csv(
+ list(MOCK_VIDEO_INFOS.values()), video_info_file
+ )
+ activity_file = tempdir / "activity_video_info.csv"
+ activities = []
+ for activity_list in self.ACTIVITIES_DATA.values():
+ for activity in activity_list:
+ activities.append(activity)
+ save_dataclass_objs_to_headered_csv(activities, activity_file)
+
+ video_data_manifest_file_path = (
+ tempdir / "video_data_manifest_file_path.json"
+ )
+ with ExitStack() as stack:
+ if dataset_type == VideoDatasetType.Frame:
+ video_data_dict = get_flat_video_frames(tempdir, "jpg")
+ elif dataset_type == VideoDatasetType.EncodedVideo:
+ video_data_dict = get_encoded_video_infos(tempdir, stack)
+
+ save_dataclass_objs_to_headered_csv(
+ list(video_data_dict.values()), video_data_manifest_file_path
+ )
+ video_ids = list(self.ACTIVITIES_DATA)
+ dataset = DomsevDataset(
+ video_data_manifest_file_path=str(video_data_manifest_file_path),
+ video_info_file_path=str(video_info_file),
+ activities_file_path=str(activity_file),
+ dataset_type=dataset_type,
+ clip_sampler=lambda x, y: [
+ VideoClipInfo(video_ids[i // 2], i * 2.0, i * 2.0 + 0.9)
+ for i in range(0, 7)
+ ],
+ )
+
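+                # The mock clip sampler above yields 7 clips: two for each of the
+                # first three mock videos and one for the fourth.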
+ self.assertEqual(len(dataset._videos), 4)
+ total_activities = [
+ activity
+ for video_activities in list(dataset._activities.values())
+ for activity in video_activities
+ ]
+ self.assertEqual(len(total_activities), 6)
+ self.assertEqual(len(dataset), 7) # Num clips
+
+ @parameterized.expand([(VideoDatasetType.Frame,), (VideoDatasetType.EncodedVideo,)])
+ def test__getitem__(self, dataset_type):
+        with tempfile.TemporaryDirectory(prefix="TestDomsevDataset") as tempdir:
+ tempdir = Path(tempdir)
+
+ video_info_file = tempdir / "test_video_info.csv"
+ save_dataclass_objs_to_headered_csv(
+ list(MOCK_VIDEO_INFOS.values()), video_info_file
+ )
+ activity_file = tempdir / "activity_video_info.csv"
+ activities = []
+ for activity_list in self.ACTIVITIES_DATA.values():
+ for activity in activity_list:
+ activities.append(activity)
+ save_dataclass_objs_to_headered_csv(activities, activity_file)
+
+ video_data_manifest_file_path = (
+ tempdir / "video_data_manifest_file_path.json"
+ )
+ with ExitStack() as stack:
+ if dataset_type == VideoDatasetType.Frame:
+ video_data_dict = get_flat_video_frames(tempdir, "jpg")
+ elif dataset_type == VideoDatasetType.EncodedVideo:
+ video_data_dict = get_encoded_video_infos(tempdir, stack)
+
+ save_dataclass_objs_to_headered_csv(
+ list(video_data_dict.values()), video_data_manifest_file_path
+ )
+ video_ids = list(self.ACTIVITIES_DATA)
+ dataset = DomsevDataset(
+ video_data_manifest_file_path=str(video_data_manifest_file_path),
+ video_info_file_path=str(video_info_file),
+ activities_file_path=str(activity_file),
+ dataset_type=dataset_type,
+ clip_sampler=lambda x, y: [
+ VideoClipInfo(video_ids[i // 2], i * 2.0, i * 2.0 + 0.9)
+ for i in range(0, 7)
+ ],
+ )
+
+ get_clip_string = (
+ "pytorchvideo.data.frame_video.FrameVideo.get_clip"
+ if dataset_type == VideoDatasetType.Frame
+ else "pytorchvideo.data.encoded_video.EncodedVideo.get_clip"
+ )
+ with unittest.mock.patch(
+ get_clip_string,
+ return_value=({"video": torch.rand(3, 5, 10, 20), "audio": []}),
+ ) as _:
+ clip_1 = dataset.__getitem__(1)
+ for i, a in enumerate(clip_1["activities"]):
+ self.assertEqual(a, self.ACTIVITIES_DATA[video_ids[0]][i])
+ self.assertEqual(clip_1["start_time"], 2.0)
+ self.assertEqual(clip_1["stop_time"], 2.9)
+ self.assertEqual(clip_1["video_id"], MOCK_VIDEO_IDS[0])
+
+ clip_2 = dataset.__getitem__(2)
+ for i, a in enumerate(clip_2["activities"]):
+ self.assertEqual(a, self.ACTIVITIES_DATA[video_ids[1]][i])
+ self.assertEqual(clip_2["start_time"], 4.0)
+ self.assertEqual(clip_2["stop_time"], 4.9)
+ self.assertEqual(clip_2["video_id"], MOCK_VIDEO_IDS[1])
diff --git a/tests/test_data_encoded_video.py b/tests/test_data_encoded_video.py
new file mode 100644
index 00000000..eb3d79d6
--- /dev/null
+++ b/tests/test_data_encoded_video.py
@@ -0,0 +1,140 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import tempfile
+import unittest
+
+import pytest
+from pytorchvideo.data.encoded_video import EncodedVideo
+from utils import temp_encoded_video, temp_encoded_video_with_audio
+
+
+class TestEncodedVideo(unittest.TestCase):
+ def test_video_works(self):
+ num_frames = 11
+ fps = 5
+ with temp_encoded_video(num_frames=num_frames, fps=fps) as (file_name, data):
+ test_video = EncodedVideo.from_path(file_name)
+ self.assertAlmostEqual(test_video.duration, num_frames / fps)
+
+ # All frames (0 - test_video.duration seconds)
+ clip = test_video.get_clip(0, test_video.duration)
+ frames, audio_samples = clip["video"], clip["audio"]
+ self.assertTrue(frames.equal(data))
+ self.assertEqual(audio_samples, None)
+
+ # Half frames
+ clip = test_video.get_clip(0, test_video.duration / 2)
+ frames, audio_samples = clip["video"], clip["audio"]
+ self.assertTrue(frames.equal(data[:, : round(num_frames / 2)]))
+ self.assertEqual(audio_samples, None)
+
+ # No frames
+ clip = test_video.get_clip(test_video.duration + 1, test_video.duration + 3)
+ frames, audio_samples = clip["video"], clip["audio"]
+ self.assertEqual(frames, None)
+ self.assertEqual(audio_samples, None)
+ test_video.close()
+
+ def test_video_with_shorter_audio_works(self):
+ num_audio_samples = 8000
+ num_frames = 5
+ fps = 5
+ audio_rate = 8000
+ with temp_encoded_video_with_audio(
+ num_frames=num_frames,
+ fps=fps,
+ num_audio_samples=num_audio_samples,
+ audio_rate=audio_rate,
+ ) as (file_name, video_data, audio_data):
+ test_video = EncodedVideo.from_path(file_name)
+
+            # Duration is the max of the video and audio streams, so here it equals
+            # the video duration.
+ self.assertEqual(test_video.duration, num_frames / fps)
+
+            # Full clip (0 - duration): all frames and all audio
+ clip = test_video.get_clip(0, test_video.duration)
+ frames, audio_samples = clip["video"], clip["audio"]
+ self.assertTrue(frames.equal(video_data))
+ self.assertTrue(audio_samples.equal(audio_data))
+
+ # Half frames
+ clip = test_video.get_clip(0, test_video.duration / 2)
+ frames, audio_samples = clip["video"], clip["audio"]
+
+ self.assertTrue(frames.equal(video_data[:, : num_frames // 2]))
+ self.assertTrue(audio_samples.equal(audio_data))
+
+ test_video.close()
+
+ def test_video_with_longer_audio_works(self):
+ audio_rate = 10000
+ fps = 5
+ num_frames = 5
+ num_audio_samples = 40000
+ with temp_encoded_video_with_audio(
+ num_frames=num_frames,
+ fps=fps,
+ num_audio_samples=num_audio_samples,
+ audio_rate=audio_rate,
+ ) as (file_name, video_data, audio_data):
+ test_video = EncodedVideo.from_path(file_name)
+
+            # Full clip: all frames and all audio
+ clip = test_video.get_clip(0, test_video.duration)
+ frames, audio_samples = clip["video"], clip["audio"]
+ self.assertTrue(frames.equal(video_data))
+ self.assertTrue(audio_samples.equal(audio_data))
+
+            # Clip requested past the end of the video; no frames or audio expected
+ clip = test_video.get_clip(test_video.duration + 1, test_video.duration + 2)
+ frames, audio_samples = clip["video"], clip["audio"]
+ self.assertEqual(frames, None)
+ self.assertEqual(audio_samples, None)
+
+ test_video.close()
+
+ def test_decode_audio_is_false(self):
+ audio_rate = 10000
+ fps = 5
+ num_frames = 5
+ num_audio_samples = 40000
+ with temp_encoded_video_with_audio(
+ num_frames=num_frames,
+ fps=fps,
+ num_audio_samples=num_audio_samples,
+ audio_rate=audio_rate,
+ ) as (file_name, video_data, audio_data):
+ test_video = EncodedVideo.from_path(file_name, decode_audio=False)
+
+            # Full clip; audio is not decoded, so only frames are returned
+ clip = test_video.get_clip(0, test_video.duration)
+ frames, audio_samples = clip["video"], clip["audio"]
+ self.assertTrue(frames.equal(video_data))
+ self.assertEqual(audio_samples, None)
+
+ test_video.close()
+
+ def test_file_api(self):
+ num_frames = 11
+ fps = 5
+ with temp_encoded_video(num_frames=num_frames, fps=fps) as (file_name, data):
+ with open(file_name, "rb") as f:
+ test_video = EncodedVideo(f)
+
+ self.assertAlmostEqual(test_video.duration, num_frames / fps)
+ clip = test_video.get_clip(0, test_video.duration)
+ frames, audio_samples = clip["video"], clip["audio"]
+ self.assertTrue(frames.equal(data))
+ self.assertEqual(audio_samples, None)
+
+ def test_open_video_failure(self):
+ with pytest.raises(FileNotFoundError):
+ test_video = EncodedVideo.from_path("non_existent_file.txt")
+ test_video.close()
+
+ def test_decode_video_failure(self):
+ with tempfile.NamedTemporaryFile(suffix=".mp4") as f:
+ f.write(b"This is not an mp4 file")
+ with pytest.raises(RuntimeError):
+ test_video = EncodedVideo.from_path(f.name)
+ test_video.close()
diff --git a/tests/test_data_encoded_video_dataset.py b/tests/test_data_encoded_video_dataset.py
new file mode 100644
index 00000000..2c62fc40
--- /dev/null
+++ b/tests/test_data_encoded_video_dataset.py
@@ -0,0 +1,686 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import collections
+import contextlib
+import itertools
+import math
+import multiprocessing
+import os
+import pathlib
+import tempfile
+import unittest
+import unittest.mock
+from typing import List, Tuple
+from unittest.mock import Mock, patch
+
+# av import has to be added for `buck test` to work.
+import av # noqa: F401
+import torch
+import torch.distributed as dist
+from parameterized import parameterized
+from pytorchvideo.data import Hmdb51
+from pytorchvideo.data.clip_sampling import make_clip_sampler
+from pytorchvideo.data.encoded_video_dataset import (
+ EncodedVideoDataset,
+ labeled_encoded_video_dataset,
+)
+from pytorchvideo.data.labeled_video_paths import LabeledVideoPaths
+from pytorchvideo.data.utils import MultiProcessSampler, thwc_to_cthw
+from torch.multiprocessing import Process
+from torch.utils.data import (
+ DataLoader,
+ DistributedSampler,
+ RandomSampler,
+ SequentialSampler,
+ TensorDataset,
+)
+from utils import create_dummy_video_frames, temp_encoded_video
+
+
+DECODER_LIST = [("pyav",), ("torchvision",)]
+
+
+class TestEncodedVideoDataset(unittest.TestCase):
+ # Clip sampling is start time inclusive so we need to subtract _EPS from
+ # total_duration / 2 to sample half of the frames of a video.
+ _EPS = 1e-9
+
+ def setUp(self):
+ # Fail fast for tests
+ EncodedVideoDataset._MAX_CONSECUTIVE_FAILURES = 1
+
+ @parameterized.expand(DECODER_LIST)
+ def test_single_clip_per_video_works(self, decoder):
+ with mock_encoded_video_dataset_file() as (mock_csv, expected, total_duration):
+ clip_sampler = make_clip_sampler("uniform", total_duration)
+ dataset = labeled_encoded_video_dataset(
+ data_path=mock_csv,
+ clip_sampler=clip_sampler,
+ video_sampler=SequentialSampler,
+ decode_audio=False,
+ decoder=decoder,
+ )
+ test_dataloader = DataLoader(dataset, batch_size=None, num_workers=2)
+
+ for _ in range(2):
+ actual = [
+ (sample["label"], sample["video"]) for sample in test_dataloader
+ ]
+ assert_unordered_list_compare_true(self, expected, actual)
+
+ @parameterized.expand(DECODER_LIST)
+ def test_video_name_with_whitespace_works(self, decoder):
+ num_frames = 10
+ fps = 5
+ with temp_encoded_video(num_frames=num_frames, fps=fps, prefix="pre fix") as (
+ video_file_name,
+ data,
+ ):
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as f:
+ f.write(f"{video_file_name} 0\n".encode())
+ f.write(f"{video_file_name} 1\n".encode())
+
+ total_duration = num_frames / fps
+ clip_sampler = make_clip_sampler("uniform", total_duration)
+ labeled_video_paths = LabeledVideoPaths.from_path(f.name)
+ dataset = EncodedVideoDataset(
+ labeled_video_paths,
+ clip_sampler=clip_sampler,
+ video_sampler=SequentialSampler,
+ decode_audio=False,
+ decoder=decoder,
+ )
+
+ expected = [(0, data), (1, data)]
+ for i, sample in enumerate(dataset):
+ self.assertTrue(sample["video"].equal(expected[i][1]))
+ self.assertEqual(sample["label"], expected[i][0])
+
+ @parameterized.expand(DECODER_LIST)
+ def test_random_clip_sampling_works(self, decoder):
+ with mock_encoded_video_dataset_file() as (
+ mock_csv,
+ label_videos,
+ total_duration,
+ ):
+ half_duration = total_duration / 2 - self._EPS
+ clip_sampler = make_clip_sampler("random", half_duration)
+ labeled_video_paths = LabeledVideoPaths.from_path(mock_csv)
+ dataset = EncodedVideoDataset(
+ labeled_video_paths,
+ clip_sampler=clip_sampler,
+ video_sampler=SequentialSampler,
+ decode_audio=False,
+ decoder=decoder,
+ )
+
+ expected_labels = [label for label, _ in label_videos]
+ for i, sample in enumerate(dataset):
+ expected_t_shape = 5
+ self.assertEqual(sample["video"].shape[1], expected_t_shape)
+ self.assertEqual(sample["label"], expected_labels[i])
+
+ @parameterized.expand(DECODER_LIST)
+ def test_reading_from_directory_structure_hmdb51(self, decoder):
+ # For an unknown reason this import has to be here for `buck test` to work.
+ import torchvision.io as io
+
+ with tempfile.TemporaryDirectory() as root_dir:
+
+ # Create test directory structure with two classes and a video in each.
+ root_dir_name = pathlib.Path(root_dir)
+ action_1 = "running"
+ action_2 = "cleaning_windows"
+
+ videos_root_dir = root_dir_name / "videos"
+ videos_root_dir.mkdir()
+
+ test_class_1 = videos_root_dir / action_1
+ test_class_1.mkdir()
+ data_1 = create_dummy_video_frames(15, 10, 10)
+ test_class_2 = videos_root_dir / action_2
+ test_class_2.mkdir()
+ data_2 = create_dummy_video_frames(20, 15, 15)
+
+ test_splits = root_dir_name / "folds"
+ test_splits.mkdir()
+
+ with tempfile.NamedTemporaryFile(
+ suffix="_u_nm_np1_ba_goo_19.avi", dir=test_class_1
+ ) as f_1, tempfile.NamedTemporaryFile(
+ suffix="_u_nm_np1_fr_med_1.avi", dir=test_class_2
+ ) as f_2:
+ f_1.close()
+ f_2.close()
+
+ # Write lossless video for each class.
+ io.write_video(
+ f_1.name,
+ data_1,
+ fps=30,
+ video_codec="libx264rgb",
+ options={"crf": "0"},
+ )
+ io.write_video(
+ f_2.name,
+ data_2,
+ fps=30,
+ video_codec="libx264rgb",
+ options={"crf": "0"},
+ )
+
+ _, video_name_1 = os.path.split(f_1.name)
+ _, video_name_2 = os.path.split(f_2.name)
+
+ with open(
+ os.path.join(test_splits, action_1 + "_test_split1.txt"), "w"
+ ) as f:
+ f.write(f"{video_name_1} 1\n")
+
+ with open(
+ os.path.join(test_splits, action_2 + "_test_split1.txt"), "w"
+ ) as f:
+ f.write(f"{video_name_2} 1\n")
+
+ clip_sampler = make_clip_sampler("uniform", 3)
+ dataset = Hmdb51(
+ data_path=test_splits,
+ video_path_prefix=root_dir_name / "videos",
+ clip_sampler=clip_sampler,
+ video_sampler=SequentialSampler,
+ split_id=1,
+ split_type="train",
+ decode_audio=False,
+ decoder=decoder,
+ )
+
+                # The two videos may be returned in either order, so normalize below
+                # such that sample_1 corresponds to action_1 before asserting.
+ sample_1 = next(dataset)
+ sample_2 = next(dataset)
+
+ self.assertTrue(sample_1["label"] in [action_1, action_2])
+ if sample_1["label"] == action_2:
+ sample_1, sample_2 = sample_2, sample_1
+
+ self.assertEqual(sample_1["label"], action_1)
+ self.assertEqual(5, len(sample_1["meta_tags"]))
+ self.assertTrue(
+ sample_1["video"].equal(thwc_to_cthw(data_1).to(torch.float32))
+ )
+
+ self.assertEqual(sample_2["label"], action_2)
+ self.assertEqual(5, len(sample_2["meta_tags"]))
+ self.assertTrue(
+ sample_2["video"].equal(thwc_to_cthw(data_2).to(torch.float32))
+ )
+
+ @parameterized.expand(DECODER_LIST)
+ def test_constant_clips_per_video_sampling_works(self, decoder):
+        # Make one video with 15 frames and one with 10 frames. With 2 clips sampled
+        # per video this produces 4 clips in total.
+ num_frames = 10
+ fps = 5
+ with temp_encoded_video(num_frames=int(num_frames * 1.5), fps=fps) as (
+ video_file_name_1,
+ data_1,
+ ):
+ with temp_encoded_video(num_frames=num_frames, fps=fps) as (
+ video_file_name_2,
+ data_2,
+ ):
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as f:
+ f.write(f"{video_file_name_1} 0\n".encode())
+ f.write(f"{video_file_name_2} 1\n".encode())
+
+ clip_frames = 2
+ duration_for_frames = clip_frames / fps - self._EPS
+ clip_sampler = make_clip_sampler(
+ "constant_clips_per_video", duration_for_frames, 2
+ )
+ labeled_video_paths = LabeledVideoPaths.from_path(f.name)
+ dataset = EncodedVideoDataset(
+ labeled_video_paths,
+ clip_sampler=clip_sampler,
+ video_sampler=SequentialSampler,
+ decode_audio=False,
+ decoder=decoder,
+ )
+
+                # Dataset has 2 videos, each yielding two evenly spaced clips of
+                # clip_frames frames. The first clip of each video starts at second
+                # 0; the second clip starts at the first frame at or after
+                # (total_duration - clip_duration) / 2.
+ half_frames_1 = math.ceil((data_1.shape[1] - clip_frames) / 2)
+ half_frames_2 = math.ceil((data_2.shape[1] - clip_frames) / 2)
+ expected = [
+ (0, data_1[:, :clip_frames]),
+ (0, data_1[:, half_frames_1 : half_frames_1 + clip_frames]),
+ (1, data_2[:, :clip_frames]),
+ (1, data_2[:, half_frames_2 : half_frames_2 + clip_frames]),
+ ]
+ for i, sample in enumerate(dataset):
+ self.assertTrue(sample["video"].equal(expected[i][1]))
+ self.assertEqual(sample["label"], expected[i][0])
+
+ @parameterized.expand(DECODER_LIST)
+ def test_reading_from_directory_structure(self, decoder):
+ # For an unknown reason this import has to be here for `buck test` to work.
+ import torchvision.io as io
+
+ with tempfile.TemporaryDirectory() as root_dir:
+
+ # Create test directory structure with two classes and a video in each.
+ root_dir_name = pathlib.Path(root_dir)
+ test_class_1 = root_dir_name / "running"
+ test_class_1.mkdir()
+ data_1 = create_dummy_video_frames(15, 10, 10)
+ test_class_2 = root_dir_name / "cleaning windows"
+ test_class_2.mkdir()
+ data_2 = create_dummy_video_frames(20, 15, 15)
+ with tempfile.NamedTemporaryFile(
+ suffix=".mp4", dir=test_class_1
+ ) as f_1, tempfile.NamedTemporaryFile(
+ suffix=".mp4", dir=test_class_2
+ ) as f_2:
+ f_1.close()
+ f_2.close()
+
+ # Write lossless video for each class.
+ io.write_video(
+ f_1.name,
+ data_1,
+ fps=30,
+ video_codec="libx264rgb",
+ options={"crf": "0"},
+ )
+ io.write_video(
+ f_2.name,
+ data_2,
+ fps=30,
+ video_codec="libx264rgb",
+ options={"crf": "0"},
+ )
+
+ clip_sampler = make_clip_sampler("uniform", 3)
+ labeled_video_paths = LabeledVideoPaths.from_path(root_dir)
+ dataset = EncodedVideoDataset(
+ labeled_video_paths,
+ clip_sampler=clip_sampler,
+ video_sampler=SequentialSampler,
+ decode_audio=False,
+ decoder=decoder,
+ )
+
+ # Videos are sorted alphabetically so "cleaning windows" (i.e. data_2)
+ # will be first.
+ sample_1 = next(dataset)
+ self.assertEqual(sample_1["label"], 0)
+ self.assertTrue(
+ sample_1["video"].equal(thwc_to_cthw(data_2).to(torch.float32))
+ )
+
+ sample_2 = next(dataset)
+ self.assertEqual(sample_2["label"], 1)
+ self.assertTrue(
+ sample_2["video"].equal(thwc_to_cthw(data_1).to(torch.float32))
+ )
+
+ @parameterized.expand(DECODER_LIST)
+ def test_random_video_sampler(self, decoder):
+ with mock_encoded_video_dataset_file() as (mock_csv, expected, total_duration):
+ clip_sampler = make_clip_sampler("uniform", total_duration)
+ dataset = labeled_encoded_video_dataset(
+ data_path=mock_csv,
+ clip_sampler=clip_sampler,
+ video_sampler=RandomSampler,
+ decode_audio=False,
+ decoder=decoder,
+ )
+
+ for _ in range(2):
+ actual = [(sample["label"], sample["video"]) for sample in dataset]
+ assert_unordered_list_compare_true(self, expected, actual)
+
+ @parameterized.expand(itertools.product([0, 1, 2], ["pyav", "torchvision"]))
+ def test_random_video_sampler_multiprocessing(self, num_workers, decoder):
+ with mock_encoded_video_dataset_file() as (mock_csv, expected, total_duration):
+ clip_sampler = make_clip_sampler("uniform", total_duration)
+ dataset = labeled_encoded_video_dataset(
+ data_path=mock_csv,
+ clip_sampler=clip_sampler,
+ video_sampler=RandomSampler,
+ decode_audio=False,
+ decoder=decoder,
+ )
+ test_dataloader = DataLoader(
+ dataset, batch_size=None, num_workers=num_workers
+ )
+
+ for _ in range(2):
+ actual = [
+ (sample["label"], sample["video"]) for sample in test_dataloader
+ ]
+ assert_unordered_list_compare_true(self, expected, actual)
+
+ @parameterized.expand(DECODER_LIST)
+ def test_sampling_with_multiple_processes(self, decoder):
+ with mock_encoded_video_dataset_file() as (
+ mock_csv,
+ label_videos,
+ total_duration,
+ ):
+ half_duration = total_duration / 2 - self._EPS
+ clip_sampler = make_clip_sampler("uniform", half_duration)
+ labeled_video_paths = LabeledVideoPaths.from_path(mock_csv)
+ dataset = EncodedVideoDataset(
+ labeled_video_paths,
+ clip_sampler=clip_sampler,
+ video_sampler=SequentialSampler,
+ decode_audio=False,
+ decoder=decoder,
+ )
+
+ # Split each full video into two clips.
+ expected = []
+ for label, data in label_videos:
+ num_frames = data.shape[0]
+ half_frames = num_frames // 2
+ first_half_data = data[:, :half_frames]
+ second_half_data = data[:, half_frames:]
+ expected.append((label, first_half_data))
+ expected.append((label, second_half_data))
+
+ test_dataloader = DataLoader(dataset, batch_size=None, num_workers=2)
+ actual = [(sample["label"], sample["video"]) for sample in test_dataloader]
+ assert_unordered_list_compare_true(self, expected, actual)
+
+ @parameterized.expand(DECODER_LIST)
+ def test_sampling_with_non_divisible_processes_by_videos(self, decoder):
+ with mock_encoded_video_dataset_file() as (
+ mock_csv,
+ label_videos,
+ total_duration,
+ ):
+ half_duration = total_duration / 2 - self._EPS
+ clip_sampler = make_clip_sampler("uniform", half_duration)
+ labeled_video_paths = LabeledVideoPaths.from_path(mock_csv)
+ dataset = EncodedVideoDataset(
+ labeled_video_paths,
+ clip_sampler=clip_sampler,
+ video_sampler=SequentialSampler,
+ decode_audio=False,
+ decoder=decoder,
+ )
+
+ # Split each full video into two clips.
+ expected = []
+ for label, data in label_videos:
+ num_frames = data.shape[0]
+ half_frames = num_frames // 2
+ first_half_data = data[:, :half_frames]
+ second_half_data = data[:, half_frames:]
+ expected.append((label, first_half_data))
+ expected.append((label, second_half_data))
+
+ test_dataloader = DataLoader(dataset, batch_size=None, num_workers=4)
+ actual = [(sample["label"], sample["video"]) for sample in test_dataloader]
+ assert_unordered_list_compare_true(self, expected, actual)
+
+ @parameterized.expand(DECODER_LIST)
+ def test_sampling_with_more_processes_than_videos(self, decoder):
+ with mock_encoded_video_dataset_file() as (
+ mock_csv,
+ label_videos,
+ total_duration,
+ ):
+ half_duration = total_duration / 2 - self._EPS
+ clip_sampler = make_clip_sampler("uniform", half_duration)
+ labeled_video_paths = LabeledVideoPaths.from_path(mock_csv)
+ dataset = EncodedVideoDataset(
+ labeled_video_paths,
+ clip_sampler=clip_sampler,
+ video_sampler=SequentialSampler,
+ decode_audio=False,
+ decoder=decoder,
+ )
+
+ # Split each full video into two clips.
+ expected = []
+ for label, data in label_videos:
+ num_frames = data.shape[0]
+ half_frames = num_frames // 2
+ first_half_data = data[:, :half_frames]
+ second_half_data = data[:, half_frames:]
+ expected.append((label, first_half_data))
+ expected.append((label, second_half_data))
+
+ test_dataloader = DataLoader(dataset, batch_size=None, num_workers=16)
+ actual = [(sample["label"], sample["video"]) for sample in test_dataloader]
+ assert_unordered_list_compare_true(self, expected, actual)
+
+ @parameterized.expand(DECODER_LIST)
+ def test_sampling_with_non_divisible_processes_by_clips(self, decoder):
+
+ # Make one video with 15 frames and one with 10 frames, producing 3 clips and 2
+ # clips respectively.
+ num_frames = 10
+ fps = 5
+ with temp_encoded_video(num_frames=int(num_frames * 1.5), fps=fps) as (
+ video_file_name_1,
+ data_1,
+ ):
+ with temp_encoded_video(num_frames=num_frames, fps=fps) as (
+ video_file_name_2,
+ data_2,
+ ):
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as f:
+ f.write(f"{video_file_name_1} 0\n".encode())
+ f.write(f"{video_file_name_2} 1\n".encode())
+
+ total_duration = num_frames / fps
+ half_duration = total_duration / 2 - self._EPS
+ clip_sampler = make_clip_sampler("uniform", half_duration)
+ labeled_video_paths = LabeledVideoPaths.from_path(f.name)
+ dataset = EncodedVideoDataset(
+ labeled_video_paths,
+ clip_sampler=clip_sampler,
+ video_sampler=SequentialSampler,
+ decode_audio=False,
+ decoder=decoder,
+ )
+
+ half_frames = num_frames // 2
+ expected = {
+                    (0, data_1[:, half_frames * 2 :]), # 3/3 clip
+                    (0, data_1[:, half_frames : half_frames * 2]), # 2/3 clip
+                    (0, data_1[:, :half_frames]), # 1/3 clip
+ (1, data_2[:, :half_frames]), # First half
+ (1, data_2[:, half_frames:]), # Second half
+ }
+
+ test_dataloader = DataLoader(dataset, batch_size=None, num_workers=2)
+ actual = [
+ (sample["label"], sample["video"]) for sample in test_dataloader
+ ]
+ assert_unordered_list_compare_true(self, expected, actual)
+
+ def test_multi_process_sampler(self):
+ # Test coverage ignores multi-process lines of code so we need to mock out
+ # the multiprocess environment information to test in a single process.
+ with patch("torch.utils.data.get_worker_info") as get_worker_info:
+ get_worker_info.return_value = Mock(id=2, num_workers=3)
+ inps = torch.arange(10 * 5, dtype=torch.float32).view(10, 5)
+ tgts = torch.arange(10 * 5, dtype=torch.float32).view(10, 5)
+ dataset = TensorDataset(inps, tgts)
+ sampler = iter(MultiProcessSampler(SequentialSampler(dataset)))
+
+ # Sampler indices will be split into 3. The last worker (id=2) will have the
+ # last 3 indices (7, 8, 9).
+ self.assertEqual(list(sampler), [7, 8, 9])
+
+ @parameterized.expand(DECODER_LIST)
+ def test_sampling_with_distributed_sampler(self, decoder):
+
+ # Make one video with 15 frames and one with 10 frames, producing 3 clips and 2
+ # clips respectively.
+ num_frames = 10
+ fps = 5
+ with temp_encoded_video(num_frames=int(num_frames * 1.5), fps=fps) as (
+ video_file_name_1,
+ data_1,
+ ):
+ with temp_encoded_video(num_frames=num_frames, fps=fps) as (
+ video_file_name_2,
+ data_2,
+ ):
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as f:
+ f.write(f"{video_file_name_1} 0\n".encode())
+ f.write(f"{video_file_name_2} 1\n".encode())
+
+ total_duration = num_frames / fps
+ half_duration = total_duration / 2 - self._EPS
+
+ # Create several processes initialized in a PyTorch distributed process
+ # group so that distributed sampler is setup correctly when dataset is
+ # constructed.
+ num_processes = 2
+ processes = []
+ return_dict = multiprocessing.Manager().dict()
+ for rank in range(num_processes):
+ p = Process(
+ target=run_distributed,
+ args=(
+ rank,
+ num_processes,
+ decoder,
+ half_duration,
+ f.name,
+ return_dict,
+ ),
+ )
+ p.start()
+ processes.append(p)
+
+ for p in processes:
+ p.join()
+
+ # After joining all distributed processes we expect all these label,
+ # video pairs to be returned in random order.
+ half_frames = num_frames // 2
+ expected = {
+ (0, data_1[:, :half_frames]), # 1/3 clip
+ (0, data_1[:, half_frames : half_frames * 2]), # 2/3 clip
+ (0, data_1[:, half_frames * 2 :]), # 3/3 clip
+ (1, data_2[:, :half_frames]), # First half
+ (1, data_2[:, half_frames:]), # Second half
+ }
+
+ epoch_results = collections.defaultdict(list)
+ for v in return_dict.values():
+ for k_2, v_2 in v.items():
+ epoch_results[k_2].extend(v_2)
+
+ assert_unordered_list_compare_true(
+ self, expected, epoch_results["epoch_1"]
+ )
+ assert_unordered_list_compare_true(
+ self, expected, epoch_results["epoch_2"]
+ )
+
+
+def assert_unordered_list_compare_true(
+    self,
+    expected: List[Tuple[int, torch.Tensor]],
+    actual: List[Tuple[int, torch.Tensor]],
+):
+    """
+    Asserts that every (label, clip) pair in expected is found in actual and that
+    the two collections have the same number of elements.
+    """
+    expected_str = str([(label, clip.shape) for label, clip in expected])
+    actual_str = str([(label, clip.shape) for label, clip in actual])
+    failure_str = f"Expected set: {expected_str}\n actual set: {actual_str}"
+    self.assertTrue(unordered_list_compare(expected, actual), msg=failure_str)
+
+
+def unordered_list_compare(
+ expected: List[Tuple[int, torch.Tensor]], actual: List[Tuple[int, torch.Tensor]]
+):
+ """
+ Returns:
+ True if all tuple values from expected found in actual and lengths are equal.
+ """
+ if len(actual) != len(expected):
+ return False
+
+ for expected_x in expected:
+
+ # Uses torch comparator for Tensor.
+ if not any(
+ actual_x[0] == expected_x[0] and actual_x[1].equal(expected_x[1])
+ for actual_x in actual
+ ):
+ return False
+
+ return True
+
+
+def run_distributed(rank, size, decoder, clip_duration, data_name, return_dict):
+ """
+ This function is run by each distributed process. It samples videos
+ based on the distributed split (determined by the
+ DistributedSampler) and returns the dataset clips in the return_dict.
+ """
+ os.environ["MASTER_ADDR"] = "127.0.0.1"
+ os.environ["MASTER_PORT"] = "29500"
+ dist.init_process_group("gloo", rank=rank, world_size=size)
+ clip_sampler = make_clip_sampler("uniform", clip_duration)
+ labeled_video_paths = LabeledVideoPaths.from_path(data_name)
+ dataset = EncodedVideoDataset(
+ labeled_video_paths,
+ clip_sampler=clip_sampler,
+ video_sampler=DistributedSampler,
+ decode_audio=False,
+ decoder=decoder,
+ )
+ test_dataloader = DataLoader(dataset, batch_size=None, num_workers=1)
+
+ # Run two epochs, simulating use in a training loop
+ dataset.video_sampler.set_epoch(0)
+ epoch_1 = [(sample["label"], sample["video"]) for sample in test_dataloader]
+ dataset.video_sampler.set_epoch(1)
+ epoch_2 = [(sample["label"], sample["video"]) for sample in test_dataloader]
+ return_dict[rank] = {"epoch_1": epoch_1, "epoch_2": epoch_2}
+
+
+@contextlib.contextmanager
+def mock_encoded_video_dataset_file():
+ """
+    Creates a temporary mock encoded video dataset with 4 entries (2 distinct videos,
+    each listed twice) labeled 0 - 3. Returns the path of a labeled video file that
+    points to this mock dataset, the ordered (label, video) tuples and the video
+    duration in seconds.
+ """
+ num_frames = 10
+ fps = 5
+ with temp_encoded_video(num_frames=num_frames, fps=fps) as (
+ video_file_name_1,
+ data_1,
+ ):
+ with temp_encoded_video(num_frames=num_frames, fps=fps) as (
+ video_file_name_2,
+ data_2,
+ ):
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as f:
+ f.write(f"{video_file_name_1} 0\n".encode())
+ f.write(f"{video_file_name_2} 1\n".encode())
+ f.write(f"{video_file_name_1} 2\n".encode())
+ f.write(f"{video_file_name_2} 3\n".encode())
+
+ label_videos = [
+ (0, data_1),
+ (1, data_2),
+ (2, data_1),
+ (3, data_2),
+ ]
+ video_duration = num_frames / fps
+ yield f.name, label_videos, video_duration
diff --git a/tests/test_data_epic_kitchen_dataset.py b/tests/test_data_epic_kitchen_dataset.py
new file mode 100644
index 00000000..f681e9ec
--- /dev/null
+++ b/tests/test_data_epic_kitchen_dataset.py
@@ -0,0 +1,258 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import tempfile
+import unittest
+import unittest.mock
+from contextlib import ExitStack
+from pathlib import Path
+
+import torch
+from parameterized import parameterized
+from pytorchvideo.data.dataset_manifest_utils import VideoClipInfo, VideoDatasetType
+from pytorchvideo.data.epic_kitchen import ActionData, EpicKitchenDataset
+from pytorchvideo.data.utils import save_dataclass_objs_to_headered_csv
+from utils import (
+ MOCK_VIDEO_IDS,
+ MOCK_VIDEO_INFOS,
+ get_encoded_video_infos,
+ get_flat_video_frames,
+)
+
+
+class TestEpicKitchenDataset(unittest.TestCase):
+
+ ACTIONS_DATAS = {
+ MOCK_VIDEO_IDS[0]: [
+ ActionData(
+ "P01",
+ "P01_01",
+ "turn on light",
+ "00:00:04.00",
+ "00:00:06.00",
+ 262,
+ 370,
+ "turn-on",
+ 12,
+ "light",
+ 113,
+ "['light']",
+ "[113]",
+ ),
+ ActionData(
+ "P01",
+ "P01_01",
+ "close door",
+ "00:00:05.00",
+ "00:00:07.00",
+ 418,
+ 569,
+ "close",
+ 3,
+ "door",
+ 8,
+ "['door']",
+ "[8]",
+ ),
+ ActionData(
+ "P01",
+ "P01_01",
+ "close fridge",
+ "00:01:1.91",
+ "01:00:5.33",
+ 1314,
+ 1399,
+ "close",
+ 3,
+ "fridge",
+ 10,
+ "['fridge']",
+ "[10]",
+ ),
+ ],
+ MOCK_VIDEO_IDS[1]: [
+ ActionData(
+ "P02",
+ "P02_002",
+ "turn on light",
+ "00:00:04.00",
+ "00:00:06.00",
+ 262,
+ 370,
+ "turn-on",
+ 12,
+ "light",
+ 113,
+ "['light']",
+ "[113]",
+ )
+ ],
+ MOCK_VIDEO_IDS[2]: [
+ ActionData(
+ "P02",
+ "P02_005",
+ "turn on light",
+ "00:00:04.00",
+ "00:00:06.00",
+ 262,
+ 370,
+ "turn-on",
+ 12,
+ "light",
+ 113,
+ "['light']",
+ "[113]",
+ )
+ ],
+ MOCK_VIDEO_IDS[3]: [
+ ActionData(
+ "P07",
+ "P07_002",
+ "turn on light",
+ "00:00:04.00",
+ "00:00:06.00",
+ 262,
+ 370,
+ "turn-on",
+ 12,
+ "light",
+ 113,
+ "['light']",
+ "[113]",
+ )
+ ],
+ }
+
+ def test_ActionData(self):
+
+ action = ActionData(
+ # This is a key-mapping as the underlying epic-kitchen
+ # annotation files are of these string columns
+ **{
+ "participant_id": "P07",
+ "video_id": "P07_002",
+ "narration": "turn on light",
+ "start_timestamp": "00:00:04.00",
+ "stop_timestamp": "00:00:06.50",
+ "start_frame": "262",
+ "stop_frame": "370",
+ "verb": "turn-on",
+ "verb_class": "12",
+ "noun": "light",
+ "noun_class": "113",
+ "all_nouns": "['light', 'finger', 'wall']",
+ "all_noun_classes": "[113, 1232, 1]",
+ }
+ )
+ self.assertEqual(action.video_id, "P07_002")
+ self.assertEqual(action.start_time, 4.0)
+ self.assertEqual(action.stop_time, 6.5)
+ self.assertEqual(action.verb_class, 12)
+ self.assertEqual(action.noun_class, 113)
+ self.assertEqual(action.all_nouns, ["light", "finger", "wall"])
+
+ self.assertEqual(action.all_noun_classes, [113, 1232, 1])
+
+ @parameterized.expand([(VideoDatasetType.Frame,), (VideoDatasetType.EncodedVideo,)])
+ def test__len__(self, dataset_type):
+        with tempfile.TemporaryDirectory(prefix="TestEpicKitchenDataset") as tempdir:
+ tempdir = Path(tempdir)
+
+ video_info_file = tempdir / "test_video_info.csv"
+ save_dataclass_objs_to_headered_csv(
+ list(MOCK_VIDEO_INFOS.values()), video_info_file
+ )
+ action_file = tempdir / "action_video_info.csv"
+ actions = []
+ for action_list in self.ACTIONS_DATAS.values():
+ for action in action_list:
+ actions.append(action)
+ save_dataclass_objs_to_headered_csv(actions, action_file)
+
+ video_data_manifest_file_path = (
+ tempdir / "video_data_manifest_file_path.json"
+ )
+ with ExitStack() as stack:
+ if dataset_type == VideoDatasetType.Frame:
+ video_data_dict = get_flat_video_frames(tempdir, "jpg")
+ elif dataset_type == VideoDatasetType.EncodedVideo:
+ video_data_dict = get_encoded_video_infos(tempdir, stack)
+
+ save_dataclass_objs_to_headered_csv(
+ list(video_data_dict.values()), video_data_manifest_file_path
+ )
+
+ dataset = EpicKitchenDataset(
+ video_info_file_path=str(video_info_file),
+ actions_file_path=str(action_file),
+ clip_sampler=lambda x, y: [
+ VideoClipInfo(str(i), i * 2.0, i * 2.0 + 0.9)
+ for i in range(0, 7)
+ ],
+ video_data_manifest_file_path=str(video_data_manifest_file_path),
+ dataset_type=dataset_type,
+ )
+
+ self.assertEqual(len(dataset), 7)
+
+ @parameterized.expand([(VideoDatasetType.Frame,), (VideoDatasetType.EncodedVideo,)])
+ def test__getitem__(self, dataset_type):
+        with tempfile.TemporaryDirectory(prefix="TestEpicKitchenDataset") as tempdir:
+ tempdir = Path(tempdir)
+
+ video_info_file = tempdir / "test_video_info.csv"
+ save_dataclass_objs_to_headered_csv(
+ list(MOCK_VIDEO_INFOS.values()), video_info_file
+ )
+ action_file = tempdir / "action_video_info.csv"
+ actions = []
+ for action_list in self.ACTIONS_DATAS.values():
+ for action in action_list:
+ actions.append(action)
+ save_dataclass_objs_to_headered_csv(actions, action_file)
+
+ video_data_manifest_file_path = (
+ tempdir / "video_data_manifest_file_path.json"
+ )
+ with ExitStack() as stack:
+ if dataset_type == VideoDatasetType.Frame:
+ video_data_dict = get_flat_video_frames(tempdir, "jpg")
+ elif dataset_type == VideoDatasetType.EncodedVideo:
+ video_data_dict = get_encoded_video_infos(tempdir, stack)
+
+ save_dataclass_objs_to_headered_csv(
+ list(video_data_dict.values()), video_data_manifest_file_path
+ )
+ video_ids = list(self.ACTIONS_DATAS)
+ dataset = EpicKitchenDataset(
+ video_info_file_path=str(video_info_file),
+ actions_file_path=str(action_file),
+ clip_sampler=lambda x, y: [
+ VideoClipInfo(video_ids[i // 2], i * 2.0, i * 2.0 + 0.9)
+ for i in range(0, 7)
+ ],
+ video_data_manifest_file_path=str(video_data_manifest_file_path),
+ dataset_type=dataset_type,
+ )
+
+ get_clip_string = (
+ "pytorchvideo.data.frame_video.FrameVideo.get_clip"
+ if dataset_type == VideoDatasetType.Frame
+ else "pytorchvideo.data.encoded_video.EncodedVideo.get_clip"
+ )
+ with unittest.mock.patch(
+ get_clip_string,
+ return_value=({"video": torch.rand(3, 5, 10, 20), "audio": []}),
+ ) as _:
+ clip_1 = dataset.__getitem__(1)
+ for i, a in enumerate(clip_1["actions"]):
+ self.assertEqual(a, self.ACTIONS_DATAS[video_ids[0]][i])
+ self.assertEqual(clip_1["start_time"], 2.0)
+ self.assertEqual(clip_1["stop_time"], 2.9)
+ self.assertEqual(clip_1["video_id"], MOCK_VIDEO_IDS[0])
+
+ clip_2 = dataset.__getitem__(2)
+ for i, a in enumerate(clip_2["actions"]):
+ self.assertEqual(a, self.ACTIONS_DATAS[video_ids[1]][i])
+ self.assertEqual(clip_2["start_time"], 4.0)
+ self.assertEqual(clip_2["stop_time"], 4.9)
+ self.assertEqual(clip_2["video_id"], MOCK_VIDEO_IDS[1])
diff --git a/tests/test_data_epic_kitchen_forecasting.py b/tests/test_data_epic_kitchen_forecasting.py
new file mode 100644
index 00000000..0a7983e9
--- /dev/null
+++ b/tests/test_data_epic_kitchen_forecasting.py
@@ -0,0 +1,424 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import unittest
+import unittest.mock
+
+import torch
+from pytorchvideo.data import EpicKitchenForecasting
+from pytorchvideo.data.epic_kitchen import ActionData
+from pytorchvideo.data.epic_kitchen_forecasting import ClipSampling
+from pytorchvideo.data.frame_video import FrameVideo
+
+
+class TestEpicKitchenForecasting(unittest.TestCase):
+ def test_transform_generator(self):
+ clip = {
+ "start_time": 2.5,
+ "stop_time": 6.5,
+ "video": torch.rand(3, 8, 10, 20),
+ "actions": [
+ ActionData(
+ "P01",
+ "P01_01",
+ "turn off light",
+ "00:00:01.00",
+ "00:00:02.00",
+ 262,
+ 370,
+ "turn-off",
+ 12,
+ "light",
+ 113,
+ "['light']",
+ "[113]",
+ ),
+ ActionData(
+ "P01",
+ "P01_01",
+ "turn on light",
+ "00:00:04.00",
+ "00:00:06.00",
+ 262,
+ 370,
+ "turn-on",
+ 12,
+ "light",
+ 113,
+ "['light']",
+ "[113]",
+ ),
+ ActionData(
+ "P01",
+ "P01_01",
+ "close door",
+ "00:00:06.00",
+ "00:00:07.00",
+ 418,
+ 569,
+ "close",
+ 3,
+ "door",
+ 8,
+ "['door']",
+ "[8]",
+ ),
+ ActionData(
+ "P01",
+ "P01_01",
+ "slam door",
+ "00:00:10.00",
+ "00:00:11.00",
+ 408,
+ 509,
+ "slam",
+ 3,
+ "door",
+ 8,
+ "['door']",
+ "[8]",
+ ),
+ ActionData(
+ "P01",
+ "P01_01",
+ "slam door",
+ "00:00:11.00",
+ "00:00:12.00",
+ 408,
+ 509,
+ "slam",
+ 3,
+ "door",
+ 8,
+ "['door']",
+ "[8]",
+ ),
+ ActionData(
+ "P01",
+ "P01_01",
+ "slam door",
+ "00:00:12.00",
+ "00:00:13.00",
+ 408,
+ 509,
+ "slam",
+ 3,
+ "door",
+ 8,
+ "['door']",
+ "[8]",
+ ),
+ ],
+ }
+
+ def additional_transform(clip):
+ clip["video"] = clip["video"].permute(1, 2, 3, 4, 0)
+ return clip
+
+ transform_fn = EpicKitchenForecasting._transform_generator(
+ additional_transform,
+ num_forecast_actions=3,
+ num_input_clips=2,
+ frames_per_clip=4,
+ )
+
+ transformed_clip = transform_fn(clip)
+
+ self.assertEqual(len(transformed_clip["actions"]), 3)
+
+ self.assertEqual(transformed_clip["actions"][0].narration, "slam door")
+ self.assertEqual(transformed_clip["actions"][1].narration, "slam door")
+ self.assertEqual(transformed_clip["actions"][2].narration, "slam door")
+
+ self.assertEqual(transformed_clip["actions"][0].start_time, 10.0)
+ self.assertEqual(transformed_clip["actions"][1].start_time, 11.0)
+ self.assertEqual(transformed_clip["actions"][2].start_time, 12.0)
+
+ self.assertEqual(transformed_clip["start_time"], 2.5)
+ self.assertEqual(transformed_clip["stop_time"], 6.5)
+
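+        # The 8 input frames are assumed to be regrouped into num_input_clips=2
+        # clips of frames_per_clip=4 frames, which additional_transform then
+        # permutes to (C, T, H, W, num_clips).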
+ self.assertEqual(
+ transformed_clip["video"].size(), torch.Size([3, 4, 10, 20, 2])
+ )
+
+ def test_frame_filter_generator(self):
+ # 11 seconds of video at 4 fps
+ input_list = list(range(44))
+
+ # 11 second clip at 4 fps, all frames are included
+ frame_filter_fn = EpicKitchenForecasting._frame_filter_generator(
+ seconds_per_clip=1,
+ num_input_clips=11,
+ frames_per_clip=4,
+ clip_time_stride=1,
+ )
+
+ all_elements = frame_filter_fn(input_list)
+
+ self.assertEqual(all_elements, input_list)
+
+ # 11 second clip at 4 fps, seconds 0-1 and 10-11 are included
+ frame_filter_fn = EpicKitchenForecasting._frame_filter_generator(
+ seconds_per_clip=1,
+ num_input_clips=2,
+ frames_per_clip=4,
+ clip_time_stride=10,
+ )
+ elements_2_clips = frame_filter_fn(input_list)
+ self.assertEqual(len(elements_2_clips), 8)
+ self.assertEqual(elements_2_clips, input_list[:4] + input_list[-4:])
+
+ # 11 second clip at 2 fps, seconds 0-1 and 10-11 are included
+ frame_filter_fn = EpicKitchenForecasting._frame_filter_generator(
+ seconds_per_clip=1,
+ num_input_clips=2,
+ frames_per_clip=2,
+ clip_time_stride=10,
+ )
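+        # With frames_per_clip=2 drawn from a 4 fps source, every other frame within
+        # each selected 1 second window is expected to be kept.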
+ elements_2_clips_2fps = frame_filter_fn(input_list)
+ self.assertEqual(len(elements_2_clips_2fps), 4)
+ self.assertEqual(elements_2_clips_2fps, [0, 2, 40, 42])
+
+ def test_define_clip_structure_generator(self):
+ frame_videos = {
+ "P01_003": FrameVideo.from_frame_paths(
+ [f"root/P01_003/frame_{i}" for i in range(200)], 10
+ ),
+ "P02_004": FrameVideo.from_frame_paths(
+ [f"root/P02_004/frame_{i}" for i in range(300)], 10
+ ),
+ "P11_010": FrameVideo.from_frame_paths(
+ [f"root/P11_010/frame_{i}" for i in range(600)], 30
+ ),
+ }
+ actions = {
+ "P01_003": [
+ ActionData(
+ "P01",
+ "P01_003",
+ "turn off light",
+ "00:00:01.00",
+ "00:00:02.00",
+ 262,
+ 370,
+ "turn-off",
+ 12,
+ "light",
+ 113,
+ "['light']",
+ "[113]",
+ ),
+ ActionData(
+ "P01",
+ "P01_003",
+ "turn on light",
+ "00:00:04.00",
+ "00:00:05.00",
+ 262,
+ 370,
+ "turn-on",
+ 12,
+ "light",
+ 113,
+ "['light']",
+ "[113]",
+ ),
+ ActionData(
+ "P01",
+ "P01_003",
+ "close door",
+ "00:00:06.00",
+ "00:00:07.00",
+ 418,
+ 569,
+ "close",
+ 3,
+ "door",
+ 8,
+ "['door']",
+ "[8]",
+ ),
+ ActionData(
+ "P01",
+ "P01_003",
+ "slam door",
+ "00:00:10.00",
+ "00:00:11.00",
+ 408,
+ 509,
+ "slam",
+ 3,
+ "door",
+ 8,
+ "['door']",
+ "[8]",
+ ),
+ ],
+ "P02_004": [
+ ActionData(
+ "P02",
+ "P02_004",
+ "turn off light",
+ "00:00:04.00",
+ "00:00:05.00",
+ 262,
+ 370,
+ "turn-off",
+ 12,
+ "light",
+ 113,
+ "['light']",
+ "[113]",
+ ),
+ ActionData(
+ "P02",
+ "P02_004",
+ "turn on light",
+ "00:00:05.00",
+ "00:00:06.00",
+ 262,
+ 370,
+ "turn-on",
+ 12,
+ "light",
+ 113,
+ "['light']",
+ "[113]",
+ ),
+ ActionData(
+ "P02",
+ "P02_004",
+ "close door",
+ "00:00:08.00",
+ "00:00:09.00",
+ 418,
+ 569,
+ "close",
+ 3,
+ "door",
+ 8,
+ "['door']",
+ "[8]",
+ ),
+ ActionData(
+ "P02",
+ "P02_004",
+ "slam door",
+ "00:00:10.00",
+ "00:00:11.00",
+ 408,
+ 509,
+ "slam",
+ 3,
+ "door",
+ 8,
+ "['door']",
+ "[8]",
+ ),
+ ],
+ "P11_010": [
+ ActionData(
+ "P11",
+ "P11_010",
+ "turn off light",
+ "00:00:01.00",
+ "00:00:02.00",
+ 262,
+ 370,
+ "turn-off",
+ 12,
+ "light",
+ 113,
+ "['light']",
+ "[113]",
+ ),
+ ActionData(
+ "P11",
+ "P11_010",
+ "turn on light",
+ "00:00:04.00",
+ "00:00:05.50",
+ 262,
+ 370,
+ "turn-on",
+ 12,
+ "light",
+ 113,
+ "['light']",
+ "[113]",
+ ),
+ ActionData(
+ "P11",
+ "P11_010",
+ "turn on light",
+ "00:00:04.00",
+ "00:00:06.00",
+ 262,
+ 370,
+ "turn-on",
+ 12,
+ "light",
+ 113,
+ "['light']",
+ "[113]",
+ ),
+ ActionData(
+ "P11",
+ "P11_010",
+ "close door",
+ "00:00:06.00",
+ "00:00:07.00",
+ 418,
+ 569,
+ "close",
+ 3,
+ "door",
+ 8,
+ "['door']",
+ "[8]",
+ ),
+ ActionData(
+ "P11",
+ "P11_010",
+ "slam door",
+ "00:00:10.00",
+ "00:00:11.00",
+ 408,
+ 509,
+ "slam",
+ 3,
+ "door",
+ 8,
+ "['door']",
+ "[8]",
+ ),
+ ],
+ }
+ random_value = 0.5
+ with unittest.mock.patch("random.random", return_value=random_value) as _:
+ define_clip_structure_fn = (
+ EpicKitchenForecasting._define_clip_structure_generator(
+ seconds_per_clip=1,
+ clip_time_stride=3,
+ num_input_clips=2,
+ num_forecast_actions=2,
+ clip_sampling=ClipSampling.Random,
+ )
+ )
+ clips = define_clip_structure_fn(frame_videos, actions)
+ sorted_clips = sorted(clips, key=lambda c: c.start_time) # For stability
+ for clip in sorted_clips:
+ self.assertEqual(clip.stop_time - clip.start_time, 4.0)
+
+ clips_P01_003 = [c for c in sorted_clips if c.video_id == "P01_003"]
+ self.assertEqual(len(clips_P01_003), 1)
+
+            self.assertEqual(
+                clips_P01_003[0].start_time, actions["P01_003"][1].stop_time
+            )
+
+ clips_P02_004 = [c for c in sorted_clips if c.video_id == "P02_004"]
+ self.assertEqual(len(clips_P02_004), 2)
+            self.assertEqual(
+                clips_P02_004[0].start_time, actions["P02_004"][0].stop_time
+            )
+            self.assertEqual(
+                clips_P02_004[1].start_time, actions["P02_004"][1].stop_time
+            )
+
+ clips_P11_010 = [c for c in sorted_clips if c.video_id == "P11_010"]
+ self.assertEqual(len(clips_P11_010), 1)
+            self.assertEqual(
+                clips_P11_010[0].start_time, actions["P11_010"][1].stop_time
+            )
diff --git a/tests/test_data_epic_kitchen_recognition.py b/tests/test_data_epic_kitchen_recognition.py
new file mode 100644
index 00000000..bde826ad
--- /dev/null
+++ b/tests/test_data_epic_kitchen_recognition.py
@@ -0,0 +1,166 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import unittest
+import unittest.mock
+
+import torch
+from pytorchvideo.data import EpicKitchenRecognition
+from pytorchvideo.data.epic_kitchen import ActionData
+from pytorchvideo.data.epic_kitchen_recognition import ClipSampling
+from pytorchvideo.data.frame_video import FrameVideo
+
+
+class TestEpicKitchenRecognition(unittest.TestCase):
+ def test_transform_generator(self):
+ clip = {
+ "start_time": 2.5,
+ "stop_time": 6.5,
+ "video": torch.rand(3, 4, 10, 20),
+ "actions": [
+ ActionData(
+ "P01",
+ "P01_01",
+ "turn off light",
+ "00:00:01.00",
+ "00:00:02.00",
+ 262,
+ 370,
+ "turn-off",
+ 12,
+ "light",
+ 113,
+ "['light']",
+ "[113]",
+ ),
+ ActionData(
+ "P01",
+ "P01_01",
+ "turn on light",
+ "00:00:04.00",
+ "00:00:06.00",
+ 262,
+ 370,
+ "turn-on",
+ 12,
+ "light",
+ 113,
+ "['light']",
+ "[113]",
+ ),
+ ActionData(
+ "P01",
+ "P01_01",
+ "close door",
+ "00:00:06.00",
+ "00:00:07.00",
+ 418,
+ 569,
+ "close",
+ 3,
+ "door",
+ 8,
+ "['door']",
+ "[8]",
+ ),
+ ActionData(
+ "P01",
+ "P01_01",
+ "slam door",
+ "00:00:10.00",
+ "00:00:11.00",
+ 408,
+ 509,
+ "slam",
+ 3,
+ "door",
+ 8,
+ "['door']",
+ "[8]",
+ ),
+ ],
+ }
+
+ def additional_transform(clip):
+ clip["video"] = clip["video"].permute(1, 2, 3, 0)
+ return clip
+
+ transform_fn = EpicKitchenRecognition._transform_generator(additional_transform)
+
+ transformed_clip = transform_fn(clip)
+
+ self.assertEqual(len(transformed_clip["actions"]), 2)
+ # Sort for stability
+ sorted_actions = sorted(transformed_clip["actions"], key=lambda a: a.start_time)
+
+ self.assertEqual(sorted_actions[0].narration, "turn on light")
+ self.assertEqual(sorted_actions[1].narration, "close door")
+
+ self.assertEqual(transformed_clip["start_time"], 2.5)
+ self.assertEqual(transformed_clip["stop_time"], 6.5)
+
+ self.assertEqual(transformed_clip["video"].size(), torch.Size([4, 10, 20, 3]))
+
+ def test_frame_filter_generator(self):
+ input_list = list(range(10))
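+        # The filter is assumed to subsample the input down to the requested number
+        # of frames with a uniform stride, always keeping the first frame.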
+
+ frame_filter_fn = EpicKitchenRecognition._frame_filter_generator(10)
+ all_elements = frame_filter_fn(input_list)
+ self.assertEqual(all_elements, input_list)
+
+ frame_filter_fn = EpicKitchenRecognition._frame_filter_generator(5)
+ half_elements = frame_filter_fn(input_list)
+ self.assertEqual(len(half_elements), 5)
+ self.assertEqual(half_elements, [i for i in input_list if not i % 2])
+
+ frame_filter_fn = EpicKitchenRecognition._frame_filter_generator(1)
+ half_elements = frame_filter_fn(input_list)
+ self.assertEqual(len(half_elements), 1)
+ self.assertEqual(half_elements[0], 0)
+
+ def test_define_clip_structure_generator(self):
+ seconds_per_clip = 5
+ define_clip_structure_fn = (
+ EpicKitchenRecognition._define_clip_structure_generator(
+ seconds_per_clip=5, clip_sampling=ClipSampling.RandomOffsetUniform
+ )
+ )
+ frame_videos = {
+ "P01_003": FrameVideo.from_frame_paths(
+ [f"root/P01_003/frame_{i}" for i in range(100)], 10
+ ),
+ "P02_004": FrameVideo.from_frame_paths(
+ [f"root/P02_004/frame_{i}" for i in range(300)], 10
+ ),
+ "P11_010": FrameVideo.from_frame_paths(
+ [f"root/P11_010/frame_{i}" for i in range(600)], 30
+ ),
+ }
+ actions = {video_id: [] for video_id in frame_videos}
+ random_value = 0.5
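+        # With random.random() mocked to 0.5, each clip is expected to start half a
+        # clip length into its 5 second window, i.e. at seconds_per_clip * (i + 0.5).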
+ with unittest.mock.patch("random.random", return_value=random_value) as _:
+ clips = define_clip_structure_fn(frame_videos, actions)
+ sorted_clips = sorted(clips, key=lambda c: c.start_time) # For stability
+
+ for clip in sorted_clips:
+ self.assertEqual(clip.stop_time - clip.start_time, seconds_per_clip)
+
+ clips_P01_003 = [c for c in sorted_clips if c.video_id == "P01_003"]
+ self.assertEqual(len(clips_P01_003), 1)
+ for i in range(len(clips_P01_003)):
+ self.assertEqual(
+ clips_P01_003[i].start_time, seconds_per_clip * (i + random_value)
+ )
+
+ clips_P02_004 = [c for c in sorted_clips if c.video_id == "P02_004"]
+ self.assertEqual(len(clips_P02_004), 5)
+ for i in range(len(clips_P02_004)):
+ self.assertEqual(
+ clips_P02_004[i].start_time, seconds_per_clip * (i + random_value)
+ )
+
+ clips_P11_010 = [c for c in sorted_clips if c.video_id == "P11_010"]
+ self.assertEqual(len(clips_P11_010), 3)
+ for i in range(len(clips_P11_010)):
+ self.assertEqual(
+ clips_P11_010[i].start_time, seconds_per_clip * (i + random_value)
+ )
diff --git a/tests/test_data_epic_kitchen_utils.py b/tests/test_data_epic_kitchen_utils.py
new file mode 100644
index 00000000..ba0395a2
--- /dev/null
+++ b/tests/test_data_epic_kitchen_utils.py
@@ -0,0 +1,190 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import os
+import tempfile
+import unittest
+import unittest.mock
+from pathlib import Path
+
+from pytorchvideo.data.dataset_manifest_utils import EncodedVideoInfo, VideoFrameInfo
+from pytorchvideo.data.epic_kitchen.utils import (
+ build_encoded_manifest_from_nested_directory,
+ build_frame_manifest_from_flat_directory,
+ build_frame_manifest_from_nested_directory,
+)
+
+
+def write_mock_frame_files(video_frames, tempdir, ext):
+ tempdir = Path(tempdir)
+ for _, video_frame_info in video_frames.items():
+ if not os.path.isdir(video_frame_info.location):
+ os.mkdir(video_frame_info.location)
+
+        # Write the frames in reverse order to verify that downstream code does not
+        # rely on os.listdir returning files in the order they were written.
+        for frame_num in reversed(
+            range(
+                video_frame_info.min_frame_number,
+                video_frame_info.max_frame_number + 1,
+            )
+        ):
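+            # Zero-pad the frame number so that stem + digits together span exactly
+            # frame_string_length characters, matching the manifest naming scheme.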
+ frame_num_str = str(frame_num)
+ stem = video_frame_info.frame_file_stem
+ frame_num_zeros = "0" * (
+ video_frame_info.frame_string_length - len(frame_num_str) - len(stem)
+ )
+ frame_file_name = f"{stem}{frame_num_zeros}{frame_num_str}.{ext}"
+ with open(f"{video_frame_info.location}/{frame_file_name}", "w") as f:
+ f.write("0")
+
+
+def get_flat_video_frames(directory, file_extension):
+ return {
+ "P02_001": VideoFrameInfo(
+ video_id="P02_001",
+ location=f"{directory}/P02_001",
+ frame_file_stem="frame_",
+ frame_string_length=16,
+ min_frame_number=1,
+ max_frame_number=3000,
+ file_extension=file_extension,
+ ),
+ "P02_002": VideoFrameInfo(
+ video_id="P02_002",
+ location=f"{directory}/P02_002",
+ frame_file_stem="frame_",
+ frame_string_length=16,
+ min_frame_number=2,
+ max_frame_number=3001,
+ file_extension=file_extension,
+ ),
+ "P02_005": VideoFrameInfo(
+ video_id="P02_005",
+ location=f"{directory}/P02_005",
+ frame_file_stem="frame_",
+ frame_string_length=16,
+ min_frame_number=1,
+ max_frame_number=30003,
+ file_extension=file_extension,
+ ),
+ "P07_002": VideoFrameInfo(
+ video_id="P07_002",
+ location=f"{directory}/P07_002",
+ frame_file_stem="frame_",
+ frame_string_length=16,
+ min_frame_number=2,
+ max_frame_number=1530,
+ file_extension=file_extension,
+ ),
+ }
+
+
+def get_nested_video_frames(directory, file_extension):
+ return {
+ "P02_001": VideoFrameInfo(
+ video_id="P02_001",
+ location=f"{directory}/P02",
+ frame_file_stem="P02_001_",
+ frame_string_length=16,
+ min_frame_number=1,
+ max_frame_number=3000,
+ file_extension=file_extension,
+ ),
+ "P02_002": VideoFrameInfo(
+ video_id="P02_002",
+ location=f"{directory}/P02",
+ frame_file_stem="P02_002_",
+ frame_string_length=16,
+ min_frame_number=2,
+ max_frame_number=3001,
+ file_extension=file_extension,
+ ),
+ "P02_005": VideoFrameInfo(
+ video_id="P02_005",
+ location=f"{directory}/P02",
+ frame_file_stem="P02_005_",
+ frame_string_length=16,
+ min_frame_number=1,
+ max_frame_number=30003,
+ file_extension=file_extension,
+ ),
+ "P07_002": VideoFrameInfo(
+ video_id="P07_002",
+ location=f"{directory}/P07",
+ frame_file_stem="P07_002_",
+ frame_string_length=16,
+ min_frame_number=2,
+ max_frame_number=1530,
+ file_extension=file_extension,
+ ),
+ }
+
+
+class TestEpicKitchenUtils(unittest.TestCase):
+ def test_build_frame_manifest_from_flat_directory_sync(self):
+ self.test_build_frame_manifest_from_flat_directory(multithreading=False)
+
+ def test_build_frame_manifest_from_flat_directory(self, multithreading=True):
+ with tempfile.TemporaryDirectory(prefix="TestEpicKitchenUtils") as tempdir:
+ video_frames_expected = get_flat_video_frames(tempdir, "jpg")
+ write_mock_frame_files(video_frames_expected, tempdir, "jpg")
+
+ video_frames = build_frame_manifest_from_flat_directory(
+ tempdir, multithreading
+ )
+
+ self.assertEqual(len(video_frames_expected), len(video_frames))
+ for video_id in video_frames_expected:
+ self.assertEqual(
+ video_frames[video_id], video_frames_expected[video_id]
+ )
+
+ def test_build_frame_manifest_from_nested_directory_sync(self):
+ self.test_build_frame_manifest_from_nested_directory(multithreading=False)
+
+ def test_build_frame_manifest_from_nested_directory(self, multithreading=True):
+ with tempfile.TemporaryDirectory(prefix="TestEpicKitchenUtils") as tempdir:
+ video_frames_expected = get_nested_video_frames(tempdir, "png")
+ write_mock_frame_files(video_frames_expected, tempdir, "png")
+
+ video_frames = build_frame_manifest_from_nested_directory(
+ tempdir, multithreading
+ )
+ self.assertEqual(len(video_frames_expected), len(video_frames))
+ for video_id in video_frames_expected:
+ self.assertEqual(
+ video_frames[video_id], video_frames_expected[video_id]
+ )
+
+ def test_build_encoded_manifest_from_nested_directory(self):
+ file_names = ["P01_01.mp4", "P01_07.mp4", "P23_11.mp4", "P11_00.mp4"]
+ with tempfile.TemporaryDirectory(prefix="TestEpicKitchenUtils") as tempdir:
+
+ for file_name in file_names:
+ participant_path = Path(tempdir) / file_name[:3]
+ if not os.path.isdir(participant_path):
+ os.mkdir(participant_path)
+
+ with open(participant_path / file_name, "w") as f:
+ f.write("0")
+
+ encoded_video_dict = build_encoded_manifest_from_nested_directory(tempdir)
+
+ self.assertEqual(
+ sorted(encoded_video_dict), ["P01_01", "P01_07", "P11_00", "P23_11"]
+ )
+ self.assertEqual(
+ encoded_video_dict["P01_01"],
+ EncodedVideoInfo("P01_01", str(Path(tempdir) / "P01/P01_01.mp4")),
+ )
+ self.assertEqual(
+ encoded_video_dict["P01_07"],
+ EncodedVideoInfo("P01_07", str(Path(tempdir) / "P01/P01_07.mp4")),
+ )
+ self.assertEqual(
+ encoded_video_dict["P11_00"],
+ EncodedVideoInfo("P11_00", str(Path(tempdir) / "P11/P11_00.mp4")),
+ )
+ self.assertEqual(
+ encoded_video_dict["P23_11"],
+ EncodedVideoInfo("P23_11", str(Path(tempdir) / "P23/P23_11.mp4")),
+ )
diff --git a/tests/test_data_frame_video.py b/tests/test_data_frame_video.py
new file mode 100644
index 00000000..3ea3f5f2
--- /dev/null
+++ b/tests/test_data_frame_video.py
@@ -0,0 +1,50 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import unittest
+
+import pytest
+from pytorchvideo.data.frame_video import FrameVideo
+from utils import temp_frame_video
+
+
+class TestFrameVideo(unittest.TestCase):
+ def test_frame_video_works(self):
+ frame_names = [f"{str(i)}.png" for i in range(3)]
+ with temp_frame_video(frame_names) as (f_name, data):
+ frame_paths = [f_name / x for x in frame_names]
+ test_video = FrameVideo.from_frame_paths(frame_paths)
+ expected_duration = (
+ 0.1 # Total duration of 3 frames at 30fps is 0.1 seconds.
+ )
+ self.assertEqual(test_video.duration, expected_duration)
+
+ # All frames (0 - 0.1 seconds)
+ clip = test_video.get_clip(0, 0.1)
+ frames, indices = clip["video"], clip["frame_indices"]
+ self.assertTrue(frames.equal(data))
+ self.assertEqual(indices, [0, 1, 2])
+
+        # All frames (0 - 0.1 seconds), filtered to the middle frame
+ clip = test_video.get_clip(0, 0.1, lambda lst: lst[1:2])
+ frames, indices = clip["video"], clip["frame_indices"]
+ self.assertTrue(frames.equal(data[:, 1:2]))
+ self.assertEqual(indices, [1])
+
+ # 2 frames (0 - 0.066 seconds)
+ clip = test_video.get_clip(0, 0.066)
+ frames, indices = clip["video"], clip["frame_indices"]
+ self.assertTrue(frames.equal(data[:, :2]))
+ self.assertEqual(indices, [0, 1])
+
+ # No frames (3 - 5 seconds)
+ result = test_video.get_clip(3, 5)
+ self.assertEqual(result, None)
+
+ def test_open_video_failure(self):
+ test_video = FrameVideo.from_frame_paths(["non_existent_file.txt"])
+ with pytest.raises(Exception):
+            test_video.get_clip(0, 0.01)  # Video duration is 1 / 30 s since it has a single frame.
+
+ def test_empty_frames_failure(self):
+ with pytest.raises(AssertionError):
+ FrameVideo.from_frame_paths([])
diff --git a/tests/test_data_ssv2_dataset.py b/tests/test_data_ssv2_dataset.py
new file mode 100644
index 00000000..e32319d6
--- /dev/null
+++ b/tests/test_data_ssv2_dataset.py
@@ -0,0 +1,96 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import contextlib
+import json
+import pathlib
+import tempfile
+import unittest
+
+from pytorchvideo.data import SSv2
+from pytorchvideo.data.clip_sampling import make_clip_sampler
+from torch.utils.data import SequentialSampler
+from utils import temp_frame_video
+
+
+@contextlib.contextmanager
+def temp_ssv2_dataset():
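+    # Build a minimal on-disk SSv2-style dataset: a label-name json, a frame-list
+    # csv describing two frame videos, and a video-label json. Yields the three
+    # file paths together with the raw frame tensors of both videos.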
+ frame_names = [f"{str(i)}.png" for i in range(7)]
+
+ # Create json file for label names.
+ labels = [
+ "Approaching something with your camera",
+ "Attaching something to something",
+ ]
+ label_names = {labels[0]: "0", labels[1]: "1"}
+ with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
+ json.dump(label_names, f)
+ label_name_file = f.name
+
+ # Create csv containing 2 test frame videos.
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as f:
+ f.write("original_vido_id video_id frame_id path labels\n".encode())
+
+ # Frame video 1
+ with temp_frame_video(frame_names) as (frame_1_video_dir, data_1):
+ for i, frame_name in enumerate(frame_names):
+ original_video_id = str(frame_1_video_dir)
+ video_id = "1"
+ frame_id = str(i)
+ path = pathlib.Path(frame_1_video_dir) / frame_name
+ f.write(
+ f"{original_video_id} {video_id} {frame_id} {path} ''\n".encode()
+ )
+
+ # Frame video 2
+ with temp_frame_video(frame_names) as (frame_2_video_dir, data_2):
+ for i, frame_name in enumerate(frame_names):
+ original_video_id = str(frame_2_video_dir)
+ video_id = "2"
+ frame_id = str(i)
+ path = pathlib.Path(frame_2_video_dir) / frame_name
+ f.write(
+ f"{original_video_id} {video_id} {frame_id} {path} ''\n".encode()
+ )
+
+ f.close()
+ video_path_file = f.name
+
+            # Create json file mapping each video to its label template.
+ with tempfile.NamedTemporaryFile(
+ mode="w", delete=False, suffix=".json"
+ ) as f:
+ videos = [
+ {"id": str(frame_1_video_dir), "template": labels[0]},
+ {"id": str(frame_2_video_dir), "template": labels[1]},
+ ]
+ json.dump(videos, f)
+ video_label_file = f.name
+
+ yield label_name_file, video_label_file, video_path_file, data_1, data_2
+
+
+class TestSSv2Dataset(unittest.TestCase):
+ def test_single_clip_per_video_works(self):
+ with temp_ssv2_dataset() as (
+ label_name_file,
+ video_label_file,
+ video_path_file,
+ video_1,
+ video_2,
+ ):
+
+            # Use an arbitrary clip duration; SSv2 always returns the full video clip.
+ clip_sampler = make_clip_sampler("constant_clips_per_video", 1.0, 1)
+            # Expect 2 frames to be sampled per clip (indices 1 and 4 of the 7 frames).
+ dataset = SSv2(
+ label_name_file,
+ video_label_file,
+ video_path_file,
+ clip_sampler=clip_sampler,
+ video_sampler=SequentialSampler,
+ frames_per_clip=2,
+ )
+ expected = [(0, video_1), (1, video_2)]
+ for sample, expected_sample in zip(dataset, expected):
+ self.assertEqual(sample["label"], expected_sample[0])
+ self.assertTrue(sample["video"].equal(expected_sample[1][:, (1, 4)]))
diff --git a/tests/test_data_utils.py b/tests/test_data_utils.py
new file mode 100644
index 00000000..504aa4fb
--- /dev/null
+++ b/tests/test_data_utils.py
@@ -0,0 +1,126 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import tempfile
+import unittest
+import unittest.mock
+from dataclasses import dataclass
+from pathlib import Path
+
+from pytorchvideo.data.utils import (
+ DataclassFieldCaster,
+ load_dataclass_dict_from_csv,
+ save_dataclass_objs_to_headered_csv,
+)
+
+
+@dataclass
+class TestDataclass(DataclassFieldCaster):
+ a: str
+ b: int
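+    # complex_initialized_dataclass_field applies the supplied callable to the raw
+    # field value when the dataclass is constructed (e.g. "1" becomes 2 for b_plus_1 below).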
+ b_plus_1: int = DataclassFieldCaster.complex_initialized_dataclass_field(
+ lambda v: int(v) + 1
+ )
+ c: float
+ d: list
+ e: dict = DataclassFieldCaster.complex_initialized_dataclass_field(lambda v: {v: v})
+
+
+@dataclass
+class TestDataclass2(DataclassFieldCaster):
+ a: str
+ b: int
+
+
+class TestDataUtils(unittest.TestCase):
+ def test_DataclassFieldCaster(self):
+ test_obj = TestDataclass("1", "1", "1", "1", "abc", "k")
+
+ self.assertEqual(test_obj.a, "1")
+ self.assertEqual(type(test_obj.a), str)
+
+ self.assertEqual(test_obj.b, 1)
+ self.assertEqual(type(test_obj.b), int)
+ self.assertEqual(test_obj.b_plus_1, 2)
+
+ self.assertEqual(test_obj.c, 1.0)
+ self.assertEqual(type(test_obj.c), float)
+
+ self.assertEqual(test_obj.d, ["a", "b", "c"])
+ self.assertEqual(type(test_obj.d), list)
+
+ self.assertEqual(test_obj.e, {"k": "k"})
+ self.assertEqual(type(test_obj.e), dict)
+
+ def test_load_dataclass_dict_from_csv_value_dict(self):
+ dataclass_objs = [
+ TestDataclass2("a", 1),
+ TestDataclass2("b", 2),
+ TestDataclass2("c", 3),
+ TestDataclass2("d", 4),
+ ]
+ with tempfile.TemporaryDirectory(prefix=f"{TestDataUtils}") as tempdir:
+ csv_file_name = Path(tempdir) / "data.csv"
+ save_dataclass_objs_to_headered_csv(dataclass_objs, csv_file_name)
+
+ test_dict = load_dataclass_dict_from_csv(
+ csv_file_name, TestDataclass2, "a", list_per_key=False
+ )
+ self.assertEqual(len(test_dict), 4)
+ self.assertEqual(test_dict["c"].b, 3)
+
+ def test_load_dataclass_dict_from_csv_list_dict(self):
+ dataclass_objs = [
+ TestDataclass2("a", 1),
+ TestDataclass2("a", 2),
+ TestDataclass2("b", 3),
+ TestDataclass2("c", 4),
+ TestDataclass2("c", 4),
+ TestDataclass2("c", 4),
+ ]
+ with tempfile.TemporaryDirectory(prefix=f"{TestDataUtils}") as tempdir:
+ csv_file_name = Path(tempdir) / "data.csv"
+ save_dataclass_objs_to_headered_csv(dataclass_objs, csv_file_name)
+ test_dict = load_dataclass_dict_from_csv(
+ csv_file_name, TestDataclass2, "a", list_per_key=True
+ )
+ self.assertEqual(len(test_dict), 3)
+ self.assertEqual([x.b for x in test_dict["a"]], [1, 2])
+ self.assertEqual([x.b for x in test_dict["b"]], [3])
+ self.assertEqual([x.b for x in test_dict["c"]], [4, 4, 4])
+
+ def test_load_dataclass_dict_from_csv_throws(self):
+ dataclass_objs = [
+ TestDataclass2("a", 1),
+ TestDataclass2("a", 2),
+ TestDataclass2("b", 3),
+ TestDataclass2("c", 4),
+ TestDataclass2("c", 4),
+ TestDataclass2("c", 4),
+ ]
+ with tempfile.TemporaryDirectory(prefix=f"{TestDataUtils}") as tempdir:
+ csv_file_name = Path(tempdir) / "data.csv"
+ save_dataclass_objs_to_headered_csv(dataclass_objs, csv_file_name)
+ self.assertRaises(
+ AssertionError,
+ lambda: load_dataclass_dict_from_csv(
+ csv_file_name, TestDataclass2, "a", list_per_key=False
+ ),
+ )
+
+ def test_save_dataclass_objs_to_headered_csv(self):
+ dataclass_objs = [
+ TestDataclass2("a", 1),
+ TestDataclass2("a", 2),
+ TestDataclass2("b", 3),
+ ]
+
+ with tempfile.TemporaryDirectory(prefix=f"{TestDataUtils}") as tempdir:
+ csv_file_name = Path(tempdir) / "data.csv"
+ save_dataclass_objs_to_headered_csv(dataclass_objs, csv_file_name)
+ with open(csv_file_name) as f:
+ lines = list(f.readlines())
+ self.assertEqual(len(lines), 4)
+ self.assertEqual(lines[0], "a,b\n")
+ self.assertEqual(lines[1], "a,1\n")
+ self.assertEqual(lines[2], "a,2\n")
+ self.assertEqual(lines[3], "b,3\n")
diff --git a/tests/test_layers_convolutions.py b/tests/test_layers_convolutions.py
new file mode 100644
index 00000000..f79129a5
--- /dev/null
+++ b/tests/test_layers_convolutions.py
@@ -0,0 +1,219 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import itertools
+import unittest
+
+import numpy as np
+import torch
+from pytorchvideo.layers.convolutions import (
+ Conv2plus1d,
+ ConvReduce3D,
+ create_conv_2plus1d,
+)
+from torch import nn
+
+
+class TestConvReduce3D(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_create_stack_conv(self):
+ """
+ Test ConvReduce3D.
+ """
+ for input_dim, output_dim in itertools.product((2, 4), (4, 8, 16)):
+ model = ConvReduce3D(
+ in_channels=input_dim,
+ out_channels=output_dim,
+ kernel_size=((1, 1, 1), (3, 3, 3), (1, 3, 3)),
+ stride=((1, 1, 1), (1, 1, 1), None),
+ padding=((0, 0, 0), (1, 1, 1), (0, 1, 1)),
+ dilation=((2, 2, 2), (1, 1, 1), None),
+ groups=(1, 2, None),
+ bias=(True, False, None),
+ )
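+            # Ground truth: three independent nn.Conv3d layers matching the three
+            # per-path configs above; the expected output is the sum of their outputs.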
+ model_gt_list = [
+ nn.Conv3d(
+ in_channels=input_dim,
+ out_channels=output_dim,
+ kernel_size=(1, 1, 1),
+ stride=(1, 1, 1),
+ padding=(0, 0, 0),
+ dilation=(2, 2, 2),
+ groups=1,
+ bias=True,
+ ),
+ nn.Conv3d(
+ in_channels=input_dim,
+ out_channels=output_dim,
+ kernel_size=(3, 3, 3),
+ stride=(1, 1, 1),
+ padding=(1, 1, 1),
+ dilation=(1, 1, 1),
+ groups=2,
+ bias=False,
+ ),
+ nn.Conv3d(
+ in_channels=input_dim,
+ out_channels=output_dim,
+ kernel_size=(1, 3, 3),
+ padding=(0, 1, 1),
+ ),
+ ]
+ model.convs[0].load_state_dict(
+ model_gt_list[0].state_dict(), strict=True
+ ) # explicitly use strict mode.
+ model.convs[1].load_state_dict(
+ model_gt_list[1].state_dict(), strict=True
+ ) # explicitly use strict mode.
+ model.convs[2].load_state_dict(
+ model_gt_list[2].state_dict(), strict=True
+ ) # explicitly use strict mode.
+
+ # Test forwarding.
+ for tensor in TestConvReduce3D._get_inputs(input_dim):
+ if tensor.shape[1] != input_dim:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(tensor)
+ continue
+ else:
+ output_tensor = model(tensor)
+ output_gt = []
+ for ind in range(3):
+ output_gt.append(model_gt_list[ind](tensor))
+ output_tensor_gt = torch.stack(output_gt, dim=0).sum(
+ dim=0, keepdim=False
+ )
+
+ self.assertEqual(
+ output_tensor.shape,
+ output_tensor_gt.shape,
+ "Output shape {} is different from expected shape {}".format(
+ output_tensor.shape, output_tensor_gt.shape
+ ),
+ )
+
+ @staticmethod
+ def _get_inputs(input_dim: int = 3) -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random tensor as test cases.
+ shapes = (
+ # Forward succeeded.
+ (1, input_dim, 3, 7, 7),
+ (1, input_dim, 5, 7, 7),
+ (1, input_dim, 7, 7, 7),
+ (2, input_dim, 3, 7, 7),
+ (4, input_dim, 3, 7, 7),
+ (8, input_dim, 3, 7, 7),
+ (2, input_dim, 3, 7, 14),
+ (2, input_dim, 3, 14, 7),
+ (2, input_dim, 3, 14, 14),
+ # Forward failed.
+ (8, input_dim * 2, 3, 7, 7),
+ (8, input_dim * 4, 5, 7, 7),
+ )
+ for shape in shapes:
+ yield torch.rand(shape)
+
+
+class TestConv2plus1d(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_create_2plus1d_conv(self):
+ """
+ Test Conv2plus1d.
+ """
+ for input_dim, output_dim in itertools.product((2, 4), (4, 8, 16)):
+ model = Conv2plus1d(
+ conv_t=nn.Conv3d(
+ in_channels=input_dim,
+ out_channels=output_dim,
+ kernel_size=(3, 1, 1),
+ stride=(2, 1, 1),
+ padding=(1, 0, 0),
+ bias=False,
+ ),
+ norm=nn.BatchNorm3d(output_dim),
+ activation=nn.ReLU(),
+ conv_xy=nn.Conv3d(
+ in_channels=output_dim,
+ out_channels=output_dim,
+ kernel_size=(1, 3, 3),
+ stride=(1, 2, 2),
+ padding=(0, 1, 1),
+ bias=False,
+ ),
+ )
+
+ model_gt = create_conv_2plus1d(
+ in_channels=input_dim,
+ out_channels=output_dim,
+ kernel_size=(3, 3, 3),
+ stride=(2, 2, 2),
+ padding=(1, 1, 1),
+ bias=False,
+ norm=nn.BatchNorm3d,
+ norm_eps=1e-5,
+ norm_momentum=0.1,
+ activation=nn.ReLU,
+ )
+
+ model.load_state_dict(
+ model_gt.state_dict(), strict=True
+ ) # explicitly use strict mode.
+
+ # Test forwarding.
+ for input_tensor in TestConv2plus1d._get_inputs():
+ with torch.no_grad():
+ if input_tensor.shape[1] != input_dim:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(input_tensor)
+ continue
+ else:
+ output_tensor = model(input_tensor)
+ output_tensor_gt = model_gt(input_tensor)
+ self.assertEqual(
+ output_tensor.shape,
+ output_tensor_gt.shape,
+ "Output shape {} is different from expected shape {}".format(
+ output_tensor.shape, output_tensor_gt.shape
+ ),
+ )
+ self.assertTrue(
+ np.allclose(output_tensor.numpy(), output_tensor_gt.numpy())
+ )
+
+ @staticmethod
+ def _get_inputs(input_dim: int = 3) -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random tensor as test cases.
+ shapes = (
+ # Forward succeeded.
+ (1, input_dim, 3, 7, 7),
+ (1, input_dim, 5, 7, 7),
+ (1, input_dim, 7, 7, 7),
+ (2, input_dim, 3, 7, 7),
+ (4, input_dim, 3, 7, 7),
+ (8, input_dim, 3, 7, 7),
+ (2, input_dim, 3, 7, 14),
+ (2, input_dim, 3, 14, 7),
+ (2, input_dim, 3, 14, 14),
+ # Forward failed.
+ (8, input_dim * 2, 3, 7, 7),
+ (8, input_dim * 4, 5, 7, 7),
+ )
+ for shape in shapes:
+ yield torch.rand(shape)
diff --git a/tests/test_layers_fusion.py b/tests/test_layers_fusion.py
new file mode 100644
index 00000000..cc35b9f4
--- /dev/null
+++ b/tests/test_layers_fusion.py
@@ -0,0 +1,64 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import unittest
+
+import torch
+from pytorchvideo.layers import make_fusion_layer
+
+
+class TestFusion(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ self.fake_input_1 = torch.Tensor(
+ [[[4, -2], [3, 0]], [[0, 2], [4, 3]], [[3, 1], [5, 2]]]
+ ).float()
+ self.fake_input_2 = torch.Tensor(
+ [[[1, 2], [3, 4]], [[5, 6], [6, 5]], [[4, 3], [2, 1]]]
+ ).float()
+
+ def test_reduce_fusion_layers(self):
+ expected_output_for_method = {
+ "max": torch.Tensor(
+ [[[4, 2], [3, 4]], [[5, 6], [6, 5]], [[4, 3], [5, 2]]]
+ ).float(),
+ "sum": torch.Tensor(
+ [[[5, 0], [6, 4]], [[5, 8], [10, 8]], [[7, 4], [7, 3]]]
+ ).float(),
+ "prod": torch.Tensor(
+ [[[4, -4], [9, 0]], [[0, 12], [24, 15]], [[12, 3], [10, 2]]]
+ ).float(),
+ }
+
+ for method, expected_output in expected_output_for_method.items():
+ model = make_fusion_layer(
+ method, [self.fake_input_1.shape[-1], self.fake_input_2.shape[-1]]
+ )
+ output = model([self.fake_input_1, self.fake_input_2])
+ self.assertTrue(torch.equal(output, expected_output))
+ self.assertEqual(model.output_dim, self.fake_input_1.shape[-1])
+
+ def test_concat_fusion(self):
+ model = make_fusion_layer(
+ "concat", [self.fake_input_1.shape[-1], self.fake_input_2.shape[-1]]
+ )
+ input_list = [self.fake_input_1, self.fake_input_2]
+ output = model(input_list)
+ expected_output = torch.cat(input_list, dim=-1)
+ self.assertTrue(torch.equal(output, expected_output))
+
+ expected_shape = self.fake_input_1.shape[-1] + self.fake_input_2.shape[-1]
+ self.assertEqual(model.output_dim, expected_shape)
+
+ def test_temporal_concat_fusion(self):
+ model = make_fusion_layer(
+ "temporal_concat",
+ [self.fake_input_1.shape[-1], self.fake_input_2.shape[-1]],
+ )
+ input_list = [self.fake_input_1, self.fake_input_2]
+ output = model(input_list)
+
+ expected_output = torch.cat(input_list, dim=-2)
+ self.assertTrue(torch.equal(output, expected_output))
+ self.assertEqual(model.output_dim, self.fake_input_2.shape[-1])
diff --git a/tests/test_layers_mlp.py b/tests/test_layers_mlp.py
new file mode 100644
index 00000000..051ad318
--- /dev/null
+++ b/tests/test_layers_mlp.py
@@ -0,0 +1,36 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import itertools
+import unittest
+
+import torch
+import torch.nn as nn
+from pytorchvideo.layers import make_multilayer_perceptron
+
+
+class TestMLP(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_make_multilayer_perceptron(self):
+ fake_input = torch.rand((8, 64))
+ fcs = [64, 128, 64, 32]
+ mid_activations = [nn.ReLU, nn.Sigmoid]
+ final_activations = [nn.ReLU, nn.Sigmoid, None]
+ norms = [nn.LayerNorm, nn.BatchNorm1d, None]
+ for mid_act, final_act, norm in itertools.product(
+ mid_activations, final_activations, norms
+ ):
+ mlp, output_dim = make_multilayer_perceptron(
+ fully_connected_dims=fcs,
+ mid_activation=mid_act,
+ final_activation=final_act,
+ norm=norm,
+ dropout_rate=0.5,
+ )
+
+ self.assertEqual(output_dim, 32)
+
+ output = mlp(fake_input)
+            self.assertEqual(output.shape, torch.Size([8, 32]))
diff --git a/tests/test_layers_nonlocal_net.py b/tests/test_layers_nonlocal_net.py
new file mode 100644
index 00000000..d3075aa1
--- /dev/null
+++ b/tests/test_layers_nonlocal_net.py
@@ -0,0 +1,159 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import itertools
+import unittest
+from typing import Iterable
+
+import numpy as np
+import torch
+from pytorchvideo.layers.nonlocal_net import NonLocal, create_nonlocal
+from torch import nn
+
+
+class TestNonlocal(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_build_nonlocal(self):
+ """
+        Test the NonLocal module with different configurations.
+ """
+ for dim_in, dim_inner, pool, norm, instantiation in itertools.product(
+ (4, 8),
+ (2, 4),
+ (None, nn.MaxPool3d(2)),
+ (None, nn.BatchNorm3d),
+ ("dot_product", "softmax"),
+ ):
+ model = NonLocal(
+ conv_theta=nn.Conv3d(
+ dim_in, dim_inner, kernel_size=1, stride=1, padding=0
+ ),
+ conv_phi=nn.Conv3d(
+ dim_in, dim_inner, kernel_size=1, stride=1, padding=0
+ ),
+ conv_g=nn.Conv3d(dim_in, dim_inner, kernel_size=1, stride=1, padding=0),
+ conv_out=nn.Conv3d(
+ dim_inner, dim_in, kernel_size=1, stride=1, padding=0
+ ),
+ pool=pool,
+ norm=norm(dim_in) if norm is not None else None,
+ instantiation=instantiation,
+ )
+
+ # Test forwarding.
+ for input_tensor in TestNonlocal._get_inputs(input_dim=dim_in):
+ if input_tensor.shape[1] != dim_in:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(input_tensor)
+ continue
+ else:
+ output_tensor = model(input_tensor)
+
+ input_shape = input_tensor.shape
+ output_shape = output_tensor.shape
+
+ self.assertEqual(
+ input_shape,
+ output_shape,
+ "Input shape {} is different from output shape {}".format(
+ input_shape, output_shape
+ ),
+ )
+
+ def test_nonlocal_builder(self):
+ """
+ Test builder `create_nonlocal`.
+ """
+ for dim_in, dim_inner, pool_size, norm, instantiation in itertools.product(
+ (4, 8),
+ (2, 4),
+ ((1, 1, 1), (2, 2, 2)),
+ (None, nn.BatchNorm3d),
+ ("dot_product", "softmax"),
+ ):
+ conv_theta = nn.Conv3d(
+ dim_in, dim_inner, kernel_size=1, stride=1, padding=0
+ )
+ conv_phi = nn.Conv3d(dim_in, dim_inner, kernel_size=1, stride=1, padding=0)
+ conv_g = nn.Conv3d(dim_in, dim_inner, kernel_size=1, stride=1, padding=0)
+ conv_out = nn.Conv3d(dim_inner, dim_in, kernel_size=1, stride=1, padding=0)
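+            # Build the norm / pool modules for the hand-constructed ground-truth block;
+            # pooling is only used when some pool_size dimension is greater than 1.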
+ if norm is None:
+ norm_model = None
+ else:
+ norm_model = norm(num_features=dim_in)
+ if isinstance(pool_size, Iterable) and any(size > 1 for size in pool_size):
+ pool_model = nn.MaxPool3d(
+ kernel_size=pool_size, stride=pool_size, padding=[0, 0, 0]
+ )
+ else:
+ pool_model = None
+
+ model = create_nonlocal(
+ dim_in=dim_in,
+ dim_inner=dim_inner,
+ pool_size=pool_size,
+ instantiation=instantiation,
+ norm=norm,
+ )
+
+ model_gt = NonLocal(
+ conv_theta=conv_theta,
+ conv_phi=conv_phi,
+ conv_g=conv_g,
+ conv_out=conv_out,
+ pool=pool_model,
+ norm=norm_model,
+ instantiation=instantiation,
+ )
+ model.load_state_dict(
+ model_gt.state_dict(), strict=True
+ ) # explicitly use strict mode.
+
+ # Test forwarding.
+ for input_tensor in TestNonlocal._get_inputs(input_dim=dim_in):
+ with torch.no_grad():
+ if input_tensor.shape[1] != dim_in:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(input_tensor)
+ continue
+ else:
+ output_tensor = model(input_tensor)
+ output_tensor_gt = model_gt(input_tensor)
+ self.assertEqual(
+ output_tensor.shape,
+ output_tensor_gt.shape,
+ "Output shape {} is different from expected shape {}".format(
+ output_tensor.shape, output_tensor_gt.shape
+ ),
+ )
+ self.assertTrue(
+ np.allclose(output_tensor.numpy(), output_tensor_gt.numpy())
+ )
+
+ @staticmethod
+ def _get_inputs(input_dim: int = 8) -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random tensor as test cases.
+ shapes = (
+ # Forward succeeded.
+ (1, input_dim, 5, 7, 7),
+ (2, input_dim, 5, 7, 7),
+ (4, input_dim, 5, 7, 7),
+ (4, input_dim, 5, 7, 7),
+ (4, input_dim, 7, 7, 7),
+ (4, input_dim, 7, 7, 14),
+ (4, input_dim, 7, 14, 7),
+ (4, input_dim, 7, 14, 14),
+ # Forward failed.
+ (8, input_dim * 2, 3, 7, 7),
+ (8, input_dim * 4, 5, 7, 7),
+ )
+ for shape in shapes:
+ yield torch.rand(shape)
diff --git a/tests/test_layers_positional_encoding.py b/tests/test_layers_positional_encoding.py
new file mode 100644
index 00000000..9a36ee8f
--- /dev/null
+++ b/tests/test_layers_positional_encoding.py
@@ -0,0 +1,69 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import math
+import unittest
+
+import torch
+from pytorchvideo.layers import PositionalEncoding
+
+
+class TestPositionalEncoding(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ self.batch_size = 4
+ self.seq_len = 16
+ self.feature_dim = 8
+ self.fake_input = torch.randn(
+ (self.batch_size, self.seq_len, self.feature_dim)
+ ).float()
+ lengths = torch.Tensor([16, 0, 14, 15, 16, 16, 16, 16])
+ self.mask = torch.lt(
+ torch.arange(self.seq_len)[None, :], lengths[:, None].long()
+ )
+
+ def test_positional_encoding(self):
+ model = PositionalEncoding(self.feature_dim, self.seq_len)
+ output = model(self.fake_input)
+ delta = output - self.fake_input
+
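+        # Reference sinusoidal encoding from "Attention Is All You Need":
+        # pe[pos, 2i] = sin(pos / 10000^(2i / d)), pe[pos, 2i + 1] = cos(pos / 10000^(2i / d)).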
+ pe = torch.zeros(self.seq_len, self.feature_dim, dtype=torch.float)
+ position = torch.arange(0, self.seq_len, dtype=torch.float).unsqueeze(1)
+ div_term = torch.exp(
+ torch.arange(0, self.feature_dim, 2).float()
+ * (-math.log(10000.0) / self.feature_dim)
+ )
+ pe[:, 0::2] = torch.sin(position * div_term)
+ pe[:, 1::2] = torch.cos(position * div_term)
+
+ for n in range(0, self.batch_size):
+ self.assertTrue(torch.allclose(delta[n], pe, atol=1e-6))
+
+ def test_positional_encoding_with_different_pe_and_data_dimensions(self):
+ """Test that model executes even if input data dimensions
+ differs from the dimension of initialized postional encoding model"""
+
+ # When self.seq_len < positional_encoding_seq_len, pe is added to input
+ positional_encoding_seq_len = self.seq_len * 3
+ model = PositionalEncoding(self.feature_dim, positional_encoding_seq_len)
+ output = model(self.fake_input)
+
+ delta = output - self.fake_input
+ pe = torch.zeros(self.seq_len, self.feature_dim, dtype=torch.float)
+ position = torch.arange(0, self.seq_len, dtype=torch.float).unsqueeze(1)
+ div_term = torch.exp(
+ torch.arange(0, self.feature_dim, 2).float()
+ * (-math.log(10000.0) / self.feature_dim)
+ )
+ pe[:, 0::2] = torch.sin(position * div_term)
+ pe[:, 1::2] = torch.cos(position * div_term)
+
+ for n in range(0, self.batch_size):
+ self.assertTrue(torch.allclose(delta[n], pe, atol=1e-6))
+
+        # When self.seq_len > positional_encoding_seq_len, an AssertionError is raised.
+ positional_encoding_seq_len = self.seq_len // 2
+ model = PositionalEncoding(self.feature_dim, positional_encoding_seq_len)
+ with self.assertRaises(AssertionError):
+ output = model(self.fake_input)
diff --git a/tests/test_layers_squeeze_excitation.py b/tests/test_layers_squeeze_excitation.py
new file mode 100644
index 00000000..5ceaffe4
--- /dev/null
+++ b/tests/test_layers_squeeze_excitation.py
@@ -0,0 +1,51 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import copy
+import unittest
+
+import torch
+import torch.nn as nn
+from pytorchvideo.layers.squeeze_excitation import (
+ create_audio_2d_squeeze_excitation_block,
+)
+
+
+class Test2DSqueezeExcitationBlock(unittest.TestCase):
+ def setUp(self):
+
+ self.layer_args = {
+ "dim_in": 32,
+ "dim_out": 32,
+ "use_se": True,
+ "se_reduction_ratio": 16,
+ "branch_fusion": lambda x, y: x + y,
+ "conv_a_kernel_size": 3,
+ "conv_a_stride": 1,
+ "conv_a_padding": 1,
+ "conv_b_kernel_size": 3,
+ "conv_b_stride": 1,
+ "conv_b_padding": 1,
+ "norm": nn.BatchNorm2d,
+ "norm_eps": 1e-5,
+ "norm_momentum": 0.1,
+ "activation": nn.ReLU,
+ }
+
+ self.batchsize = 1
+ self.forward_pass_configs = [
+ {
+ "input": torch.rand(self.batchsize, self.layer_args["dim_in"], 100, 40),
+ "output_shape": torch.Size(
+ [self.batchsize, self.layer_args["dim_out"], 100, 40]
+ ),
+ },
+ ]
+
+ def test_forward_pass(self):
+ for split_config in self.forward_pass_configs:
+ layer_args = copy.deepcopy(self.layer_args)
+ model = create_audio_2d_squeeze_excitation_block(**layer_args)
+
+ out = model(split_config["input"])
+ self.assertTrue(isinstance(out, torch.Tensor))
+ self.assertEqual(out.size(), split_config["output_shape"])
diff --git a/tests/test_models_byol.py b/tests/test_models_byol.py
new file mode 100644
index 00000000..6007439a
--- /dev/null
+++ b/tests/test_models_byol.py
@@ -0,0 +1,36 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import unittest
+
+import torch
+from pytorchvideo.models.byol import BYOL
+from torch import nn
+
+
+class TestBYOL(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_byol(self):
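+        # Smoke test: run a BYOL forward pass on pairs of randomly generated views.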
+ byol = BYOL(
+ backbone=nn.Linear(8, 4),
+ projector=nn.Linear(4, 4),
+ feature_dim=4,
+ norm=nn.BatchNorm1d,
+ )
+ for crop1, crop2 in TestBYOL._get_inputs():
+ byol(crop1, crop2)
+
+ @staticmethod
+ def _get_inputs() -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random inputs as test cases.
+ shapes = ((2, 8),)
+ for shape in shapes:
+ yield torch.rand(shape), torch.rand(shape)
diff --git a/tests/test_models_csn.py b/tests/test_models_csn.py
new file mode 100644
index 00000000..b3baa673
--- /dev/null
+++ b/tests/test_models_csn.py
@@ -0,0 +1,96 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import itertools
+import unittest
+
+import numpy as np
+import torch
+from pytorchvideo.models.csn import create_csn
+from pytorchvideo.models.resnet import create_bottleneck_block
+from torch import nn
+
+
+class TestCSN(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_create_csn(self):
+ """
+ Test simple CSN with different inputs.
+ """
+ for input_channel, input_clip_length, input_crop_size in itertools.product(
+ (3, 2), (4, 8), (56, 64)
+ ):
+ stage_spatial_stride = (1, 2, 2, 2)
+ stage_temporal_stride = (1, 2, 2, 1)
+
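+            # The stem convolution adds a spatial stride of 2, so the head pool kernel
+            # is sized to cover the final feature map and yield a 1x1x1 output.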
+ total_spatial_stride = 2 * np.prod(stage_spatial_stride)
+ total_temporal_stride = np.prod(stage_temporal_stride)
+ head_pool_kernel_size = (
+ input_clip_length // total_temporal_stride,
+ input_crop_size // total_spatial_stride,
+ input_crop_size // total_spatial_stride,
+ )
+
+ model = create_csn(
+ input_channel=input_channel,
+ model_depth=50,
+ model_num_class=400,
+ dropout_rate=0,
+ norm=nn.BatchNorm3d,
+ activation=nn.ReLU,
+ stem_dim_out=8,
+ stem_conv_kernel_size=(3, 7, 7),
+ stem_conv_stride=(1, 2, 2),
+ stage_conv_a_kernel_size=(1, 1, 1),
+ stage_conv_b_kernel_size=(3, 3, 3),
+ stage_conv_b_width_per_group=1,
+ stage_spatial_stride=(1, 2, 2, 2),
+ stage_temporal_stride=(1, 2, 2, 1),
+ bottleneck=create_bottleneck_block,
+ head_pool=nn.AvgPool3d,
+ head_pool_kernel_size=head_pool_kernel_size,
+ head_output_size=(1, 1, 1),
+ head_activation=nn.Softmax,
+ )
+
+ # Test forwarding.
+ for tensor in TestCSN._get_inputs(
+ input_channel, input_clip_length, input_crop_size
+ ):
+ if tensor.shape[1] != input_channel:
+ with self.assertRaises(RuntimeError):
+ out = model(tensor)
+ continue
+
+ out = model(tensor)
+
+ output_shape = out.shape
+ output_shape_gt = (tensor.shape[0], 400)
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ @staticmethod
+ def _get_inputs(
+ channel: int = 3, clip_length: int = 4, crop_size: int = 112
+ ) -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random inputs as test cases.
+ shapes = (
+ (1, channel, clip_length, crop_size, crop_size),
+ (2, channel, clip_length, crop_size, crop_size),
+ )
+ for shape in shapes:
+ yield torch.rand(shape)
diff --git a/tests/test_models_head.py b/tests/test_models_head.py
new file mode 100644
index 00000000..cad57622
--- /dev/null
+++ b/tests/test_models_head.py
@@ -0,0 +1,171 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import itertools
+import unittest
+
+import numpy as np
+import torch
+from pytorchvideo.models.head import ResNetBasicHead, create_res_basic_head
+from torch import nn
+
+
+class TestHeadHelper(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_build_simple_head(self):
+ """
+ Test simple ResNetBasicHead (without dropout and activation layers).
+ """
+ for input_dim, output_dim in itertools.product((4, 8), (4, 8, 16)):
+ model = ResNetBasicHead(
+ proj=nn.Linear(input_dim, output_dim),
+ pool=nn.AdaptiveAvgPool3d(1),
+ output_pool=nn.AdaptiveAvgPool3d(1),
+ )
+
+ # Test forwarding.
+ for input_tensor in TestHeadHelper._get_inputs(input_dim=input_dim):
+ if input_tensor.shape[1] != input_dim:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(input_tensor)
+ continue
+ else:
+ output_tensor = model(input_tensor)
+
+ input_shape = input_tensor.shape
+ output_shape = output_tensor.shape
+ output_shape_gt = (input_shape[0], output_dim)
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ def test_build_complex_head(self):
+ """
+ Test complex ResNetBasicHead.
+ """
+ for input_dim, output_dim in itertools.product((4, 8), (4, 8, 16)):
+ model = ResNetBasicHead(
+ proj=nn.Linear(input_dim, output_dim),
+ activation=nn.Softmax(),
+ pool=nn.AdaptiveAvgPool3d(1),
+ dropout=nn.Dropout(0.5),
+ output_pool=nn.AdaptiveAvgPool3d(1),
+ )
+
+ # Test forwarding.
+ for input_tensor in TestHeadHelper._get_inputs(input_dim=input_dim):
+ if input_tensor.shape[1] != input_dim:
+ with self.assertRaises(Exception):
+ output_tensor = model(input_tensor)
+ continue
+
+ output_tensor = model(input_tensor)
+
+ input_shape = input_tensor.shape
+ output_shape = output_tensor.shape
+ output_shape_gt = (input_shape[0], output_dim)
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ def test_build_head_with_callable(self):
+ """
+ Test builder `create_res_basic_head`.
+ """
+ for (pool, activation) in itertools.product(
+ (nn.AvgPool3d, nn.MaxPool3d, nn.AdaptiveAvgPool3d, None),
+ (nn.ReLU, nn.Softmax, nn.Sigmoid, None),
+ ):
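+            # Instantiate the activation / pool modules used by the hand-constructed
+            # ground-truth head below.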
+ if activation is None:
+ activation_model = None
+ elif activation == nn.Softmax:
+ activation_model = activation(dim=1)
+ else:
+ activation_model = activation()
+
+ if pool is None:
+ pool_model = None
+ elif pool == nn.AdaptiveAvgPool3d:
+ pool_model = pool(1)
+ else:
+ pool_model = pool(kernel_size=[5, 7, 7], stride=[1, 1, 1])
+
+ model = create_res_basic_head(
+ in_features=16,
+ out_features=32,
+ pool=pool,
+ pool_kernel_size=(5, 7, 7),
+ output_size=(1, 1, 1),
+ dropout_rate=0.0,
+ activation=activation,
+ output_with_global_average=True,
+ )
+ model_gt = ResNetBasicHead(
+ proj=nn.Linear(16, 32),
+ activation=activation_model,
+ pool=pool_model,
+ dropout=None,
+ output_pool=nn.AdaptiveAvgPool3d(1),
+ )
+ model.load_state_dict(
+ model_gt.state_dict(), strict=True
+ ) # explicitly use strict mode.
+
+ # Test forwarding.
+ for input_tensor in TestHeadHelper._get_inputs(input_dim=16):
+ with torch.no_grad():
+ if input_tensor.shape[1] != 16:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(input_tensor)
+ continue
+ else:
+ output_tensor = model(input_tensor)
+ output_tensor_gt = model_gt(input_tensor)
+ self.assertEqual(
+ output_tensor.shape,
+ output_tensor_gt.shape,
+ "Output shape {} is different from expected shape {}".format(
+ output_tensor.shape, output_tensor_gt.shape
+ ),
+ )
+ self.assertTrue(
+ np.allclose(output_tensor.numpy(), output_tensor_gt.numpy())
+ )
+
+ @staticmethod
+ def _get_inputs(input_dim: int = 8) -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random tensor as test cases.
+ shapes = (
+ # Forward succeeded.
+ (1, input_dim, 5, 7, 7),
+ (2, input_dim, 5, 7, 7),
+ (4, input_dim, 5, 7, 7),
+ (4, input_dim, 5, 7, 7),
+ (4, input_dim, 7, 7, 7),
+ (4, input_dim, 7, 7, 14),
+ (4, input_dim, 7, 14, 7),
+ (4, input_dim, 7, 14, 14),
+ # Forward failed.
+ (8, input_dim * 2, 3, 7, 7),
+ (8, input_dim * 4, 5, 7, 7),
+ )
+ for shape in shapes:
+ yield torch.rand(shape)
diff --git a/tests/test_models_masked_multistream.py b/tests/test_models_masked_multistream.py
new file mode 100644
index 00000000..292b6a9b
--- /dev/null
+++ b/tests/test_models_masked_multistream.py
@@ -0,0 +1,130 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import copy
+import unittest
+
+import torch
+import torch.nn
+from pytorchvideo.layers import PositionalEncoding, make_multilayer_perceptron
+from pytorchvideo.models.masked_multistream import (
+ LSTM,
+ LearnMaskedDefault,
+ MaskedSequential,
+ MaskedTemporalPooling,
+ TransposeMultiheadAttention,
+ TransposeTransformerEncoder,
+)
+
+
+class TestMaskedMultiStream(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_masked_multistream_model(self):
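+        # Chain a full masked single-stream pipeline (positional encoding -> attention
+        # -> temporal pooling -> layer norm -> MLP -> learned default) and check the
+        # output shape for fully valid masks.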
+ feature_dim = 8
+ mlp, out_dim = make_multilayer_perceptron([feature_dim, 2])
+ input_stream = MaskedSequential(
+ PositionalEncoding(feature_dim),
+ TransposeMultiheadAttention(feature_dim),
+ MaskedTemporalPooling(method="avg"),
+ torch.nn.LayerNorm(feature_dim),
+ mlp,
+ LearnMaskedDefault(out_dim),
+ )
+
+ seq_len = 10
+ input_tensor = torch.rand([4, seq_len, feature_dim])
+ mask = _lengths2mask(
+ torch.tensor([seq_len, seq_len, seq_len, seq_len]), input_tensor.shape[1]
+ )
+ output = input_stream(input=input_tensor, mask=mask)
+ self.assertEqual(output.shape, torch.Size([4, out_dim]))
+
+ def test_masked_temporal_pooling(self):
+ fake_input = torch.Tensor(
+ [[[4, -2], [3, 0]], [[0, 2], [4, 3]], [[3, 1], [5, 2]]]
+ ).float()
+ valid_lengths = torch.Tensor([2, 1, 0]).int()
+ valid_mask = _lengths2mask(valid_lengths, fake_input.shape[1])
+ expected_output_for_method = {
+ "max": torch.Tensor([[4, 0], [0, 2], [0, 0]]).float(),
+ "avg": torch.Tensor([[3.5, -1], [0, 2], [0, 0]]).float(),
+ "sum": torch.Tensor([[7, -2], [0, 2], [0, 0]]).float(),
+ }
+ for method, expected_output in expected_output_for_method.items():
+ model = MaskedTemporalPooling(method)
+ output = model(copy.deepcopy(fake_input), mask=valid_mask)
+ self.assertTrue(torch.equal(output, expected_output))
+
+ def test_transpose_attention(self):
+ feature_dim = 8
+ seq_len = 10
+ fake_input = torch.rand([4, seq_len, feature_dim])
+ mask = _lengths2mask(
+ torch.tensor([seq_len, seq_len, seq_len, seq_len]), fake_input.shape[1]
+ )
+ model = TransposeMultiheadAttention(feature_dim, num_heads=2)
+ output = model(fake_input, mask=mask)
+        self.assertEqual(output.shape, fake_input.shape)
+
+ def test_masked_lstm(self):
+ feature_dim = 8
+ seq_len = 10
+ fake_input = torch.rand([4, seq_len, feature_dim])
+ mask = _lengths2mask(
+ torch.tensor([seq_len, seq_len, seq_len, seq_len]), fake_input.shape[1]
+ )
+ hidden_dim = 128
+
+ model = LSTM(feature_dim, hidden_dim=hidden_dim, bidirectional=False)
+ output = model(fake_input, mask=mask)
+        self.assertEqual(output.shape, (fake_input.shape[0], hidden_dim))
+
+ model = LSTM(feature_dim, hidden_dim=hidden_dim, bidirectional=True)
+ output = model(fake_input, mask=mask)
+        self.assertEqual(output.shape, (fake_input.shape[0], hidden_dim * 2))
+
+ def test_masked_transpose_transformer_encoder(self):
+ feature_dim = 8
+ seq_len = 10
+ fake_input = torch.rand([4, seq_len, feature_dim])
+ mask = _lengths2mask(
+ torch.tensor([seq_len, seq_len, seq_len, seq_len]), fake_input.shape[1]
+ )
+
+ model = TransposeTransformerEncoder(feature_dim)
+ output = model(fake_input, mask=mask)
+ self.assertEqual(output.shape, (fake_input.shape[0], feature_dim))
+
+ def test_learn_masked_default(self):
+ feature_dim = 8
+ seq_len = 10
+ fake_input = torch.rand([4, feature_dim])
+
+ # All valid mask
+ all_valid_mask = _lengths2mask(
+ torch.tensor([seq_len, seq_len, seq_len, seq_len]), fake_input.shape[1]
+ )
+ model = LearnMaskedDefault(feature_dim)
+ output = model(fake_input, mask=all_valid_mask)
+ self.assertTrue(output.equal(fake_input))
+
+ # No valid mask
+ no_valid_mask = _lengths2mask(torch.tensor([0, 0, 0, 0]), fake_input.shape[1])
+ model = LearnMaskedDefault(feature_dim)
+ output = model(fake_input, mask=no_valid_mask)
+ self.assertTrue(output.equal(model._learned_defaults.repeat(4, 1)))
+
+ # Half valid mask
+ half_valid_mask = _lengths2mask(torch.tensor([1, 1, 0, 0]), fake_input.shape[1])
+ model = LearnMaskedDefault(feature_dim)
+ output = model(fake_input, mask=half_valid_mask)
+ self.assertTrue(output[:2].equal(fake_input[:2]))
+ self.assertTrue(output[2:].equal(model._learned_defaults.repeat(2, 1)))
+
+
+def _lengths2mask(lengths: torch.Tensor, seq_len: int) -> torch.Tensor:
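+    # Boolean [batch, seq_len] mask where entry (i, t) is True iff t < lengths[i].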
+ return torch.lt(
+ torch.arange(seq_len, device=lengths.device)[None, :], lengths[:, None].long()
+ )
diff --git a/tests/test_models_memory_bank.py b/tests/test_models_memory_bank.py
new file mode 100644
index 00000000..19aa4b45
--- /dev/null
+++ b/tests/test_models_memory_bank.py
@@ -0,0 +1,37 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import unittest
+
+import torch
+from pytorchvideo.models.memory_bank import MemoryBank
+from torch import nn
+
+
+class TestMemoryBank(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_memory_bank(self):
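+        # Smoke test: run a MemoryBank forward pass on random features and random
+        # indices into the memory bank.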
+ simclr = MemoryBank(
+ backbone=nn.Linear(8, 4),
+ mlp=nn.Linear(4, 2),
+ temperature=0.07,
+ bank_size=8,
+ dim=2,
+ )
+ for crop, ind in TestMemoryBank._get_inputs():
+ simclr(crop, ind)
+
+ @staticmethod
+ def _get_inputs(bank_size: int = 8) -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random inputs as test cases.
+ shapes = ((2, 8),)
+ for shape in shapes:
+ yield torch.rand(shape), torch.randint(0, bank_size, size=(shape[0],))
diff --git a/tests/test_models_r2plus1d.py b/tests/test_models_r2plus1d.py
new file mode 100644
index 00000000..f86e38b9
--- /dev/null
+++ b/tests/test_models_r2plus1d.py
@@ -0,0 +1,102 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import itertools
+import unittest
+
+import numpy as np
+import torch
+from pytorchvideo.models.r2plus1d import (
+ create_2plus1d_bottleneck_block,
+ create_r2plus1d,
+)
+from pytorchvideo.models.resnet import create_bottleneck_block
+from torch import nn
+
+
+class TestR2plus1d(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_create_r2plus1d(self):
+ """
+ Test simple r2plus1d with different inputs.
+ """
+ for input_channel, input_clip_length, input_crop_size in itertools.product(
+ (3, 2), (4, 8), (56, 64)
+ ):
+ stage_spatial_stride = (2, 2, 2, 2)
+ stage_temporal_stride = (1, 1, 2, 2)
+
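+            # The stem convolution adds a spatial stride of 2; the head pool kernel
+            # is sized to match the final feature map.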
+ total_spatial_stride = 2 * np.prod(stage_spatial_stride)
+ total_temporal_stride = np.prod(stage_temporal_stride)
+ head_pool_kernel_size = (
+ input_clip_length // total_temporal_stride,
+ input_crop_size // total_spatial_stride,
+ input_crop_size // total_spatial_stride,
+ )
+
+ model = create_r2plus1d(
+ input_channel=input_channel,
+ model_depth=50,
+ model_num_class=400,
+ dropout_rate=0.0,
+ norm=nn.BatchNorm3d,
+ activation=nn.ReLU,
+ stem_dim_out=8,
+ stem_conv_kernel_size=(1, 7, 7),
+ stem_conv_stride=(1, 2, 2),
+ stage_conv_b_kernel_size=((3, 3, 3),) * 4,
+ stage_spatial_stride=stage_spatial_stride,
+ stage_temporal_stride=stage_temporal_stride,
+ stage_bottleneck=(
+ create_bottleneck_block,
+ create_2plus1d_bottleneck_block,
+ create_2plus1d_bottleneck_block,
+ create_2plus1d_bottleneck_block,
+ ),
+ head_pool=nn.AvgPool3d,
+ head_pool_kernel_size=head_pool_kernel_size,
+ head_output_size=(1, 1, 1),
+ head_activation=nn.Softmax,
+ )
+
+ # Test forwarding.
+ for tensor in TestR2plus1d._get_inputs(
+ input_channel, input_clip_length, input_crop_size
+ ):
+ if tensor.shape[1] != input_channel:
+ with self.assertRaises(RuntimeError):
+ out = model(tensor)
+ continue
+
+ out = model(tensor)
+
+ output_shape = out.shape
+ output_shape_gt = (tensor.shape[0], 400)
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ @staticmethod
+ def _get_inputs(
+ channel: int = 3, clip_length: int = 16, crop_size: int = 224
+ ) -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random inputs as test cases.
+ shapes = (
+ (1, channel, clip_length, crop_size, crop_size),
+ (2, channel, clip_length, crop_size, crop_size),
+ )
+ for shape in shapes:
+ yield torch.rand(shape)
diff --git a/tests/test_models_resnet.py b/tests/test_models_resnet.py
new file mode 100644
index 00000000..a713c65e
--- /dev/null
+++ b/tests/test_models_resnet.py
@@ -0,0 +1,1440 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import itertools
+import os
+import unittest
+
+import numpy as np
+import torch
+from pytorchvideo.models.head import ResNetBasicHead
+from pytorchvideo.models.net import Net
+from pytorchvideo.models.resnet import (
+ BottleneckBlock,
+ ResBlock,
+ ResStage,
+ SeparableBottleneckBlock,
+ create_acoustic_bottleneck_block,
+ create_acoustic_building_block,
+ create_acoustic_resnet,
+ create_bottleneck_block,
+ create_res_block,
+ create_res_stage,
+ create_resnet,
+)
+from pytorchvideo.models.stem import ResNetBasicStem
+from torch import nn
+
+
+class TestBottleneckBlock(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_create_simple_bottleneck_block(self):
+ """
+ Test simple BottleneckBlock with different dimensions.
+ """
+ for dim_in, dim_inner, dim_out in itertools.product(
+ (4, 8, 16), (2, 4), (4, 8, 16)
+ ):
+ model = BottleneckBlock(
+ conv_a=nn.Conv3d(
+ dim_in, dim_inner, kernel_size=1, stride=1, padding=0, bias=False
+ ),
+ norm_a=nn.BatchNorm3d(dim_inner),
+ act_a=nn.ReLU(),
+ conv_b=nn.Conv3d(
+ dim_inner, dim_inner, kernel_size=3, stride=1, padding=1, bias=False
+ ),
+ norm_b=nn.BatchNorm3d(dim_inner),
+ act_b=nn.ReLU(),
+ conv_c=nn.Conv3d(
+ dim_inner, dim_out, kernel_size=1, stride=1, padding=0, bias=False
+ ),
+ norm_c=nn.BatchNorm3d(dim_out),
+ )
+
+ # Test forwarding.
+ for input_tensor in TestBottleneckBlock._get_inputs(dim_in):
+ if input_tensor.shape[1] != dim_in:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(input_tensor)
+ continue
+
+ output_tensor = model(input_tensor)
+ input_shape = input_tensor.shape
+ output_shape = output_tensor.shape
+
+ output_shape_gt = (
+ input_shape[0],
+ dim_out,
+ input_shape[2],
+ input_shape[3],
+ input_shape[4],
+ )
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ def test_create_complex_bottleneck_block(self):
+ """
+ Test complex BottleneckBlock with different dimensions.
+ """
+ for dim_in, dim_inner, dim_out in itertools.product(
+ (4, 8, 16), (2, 4), (4, 8, 16)
+ ):
+ model = BottleneckBlock(
+ conv_a=nn.Conv3d(
+ dim_in,
+ dim_inner,
+ kernel_size=[3, 1, 1],
+ stride=[2, 1, 1],
+ padding=[1, 0, 0],
+ bias=False,
+ ),
+ norm_a=nn.BatchNorm3d(dim_inner),
+ act_a=nn.ReLU(),
+ conv_b=nn.Conv3d(
+ dim_inner,
+ dim_inner,
+ kernel_size=[1, 3, 3],
+ stride=[1, 2, 2],
+ padding=[0, 1, 1],
+ groups=1,
+ dilation=[1, 1, 1],
+ bias=False,
+ ),
+ norm_b=nn.BatchNorm3d(dim_inner),
+ act_b=nn.ReLU(),
+ conv_c=nn.Conv3d(
+ dim_inner,
+ dim_out,
+ kernel_size=[1, 1, 1],
+ stride=[1, 1, 1],
+ padding=[0, 0, 0],
+ bias=False,
+ ),
+ norm_c=nn.BatchNorm3d(dim_out),
+ )
+
+ # Test forwarding.
+ for input_tensor in TestBottleneckBlock._get_inputs(dim_in):
+ if input_tensor.shape[1] != dim_in:
+ with self.assertRaises(Exception):
+ output_tensor = model(input_tensor)
+ continue
+
+ output_tensor = model(input_tensor)
+ input_shape = input_tensor.shape
+ output_shape = output_tensor.shape
+
+ output_shape_gt = (
+ input_shape[0],
+ dim_out,
+ (input_shape[2] - 1) // 2 + 1,
+ (input_shape[3] - 1) // 2 + 1,
+ (input_shape[4] - 1) // 2 + 1,
+ )
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ def test_create_separable_bottleneck_block_sum(self):
+ """
+        Test SeparableBottleneckBlock with sum reduction and different dimensions.
+ """
+ for dim_in, dim_inner, dim_out in itertools.product(
+ (4, 8, 16), (2, 4), (4, 8, 16)
+ ):
+ model = SeparableBottleneckBlock(
+ conv_a=nn.Conv3d(
+ dim_in,
+ dim_inner,
+ kernel_size=[3, 1, 1],
+ stride=[2, 1, 1],
+ padding=[1, 0, 0],
+ bias=False,
+ ),
+ norm_a=nn.BatchNorm3d(dim_inner),
+ act_a=nn.ReLU(),
+ conv_b=nn.ModuleList(
+ [
+ nn.Conv3d(
+ dim_inner,
+ dim_inner,
+ kernel_size=[1, 3, 3],
+ stride=[1, 2, 2],
+ padding=[0, 1, 1],
+ groups=1,
+ dilation=[1, 1, 1],
+ bias=False,
+ ),
+ nn.Conv3d(
+ dim_inner,
+ dim_inner,
+ kernel_size=[1, 3, 3],
+ stride=[1, 2, 2],
+ padding=[0, 1, 1],
+ groups=1,
+ dilation=[1, 1, 1],
+ bias=False,
+ ),
+ ]
+ ),
+ norm_b=nn.ModuleList(
+ [nn.BatchNorm3d(dim_inner), nn.BatchNorm3d(dim_inner)]
+ ),
+ act_b=nn.ModuleList([nn.ReLU(), nn.ReLU()]),
+ conv_c=nn.Conv3d(
+ dim_inner,
+ dim_out,
+ kernel_size=[1, 1, 1],
+ stride=[1, 1, 1],
+ padding=[0, 0, 0],
+ bias=False,
+ ),
+ norm_c=nn.BatchNorm3d(dim_out),
+ reduce_method="sum",
+ )
+
+ # Test forwarding.
+ for input_tensor in TestBottleneckBlock._get_inputs(dim_in):
+ if input_tensor.shape[1] != dim_in:
+ with self.assertRaises(Exception):
+ output_tensor = model(input_tensor)
+ continue
+
+ output_tensor = model(input_tensor)
+ input_shape = input_tensor.shape
+ output_shape = output_tensor.shape
+
+ output_shape_gt = (
+ input_shape[0],
+ dim_out,
+ (input_shape[2] - 1) // 2 + 1,
+ (input_shape[3] - 1) // 2 + 1,
+ (input_shape[4] - 1) // 2 + 1,
+ )
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ def test_separable_complex_bottleneck_block_cat(self):
+ """
+        Test SeparableBottleneckBlock with cat reduction and different dimensions.
+ """
+ for dim_in, dim_inner, dim_out in itertools.product(
+ (4, 8, 16), (2, 4), (4, 8, 16)
+ ):
+ model = SeparableBottleneckBlock(
+ conv_a=nn.Conv3d(
+ dim_in,
+ dim_inner,
+ kernel_size=[3, 1, 1],
+ stride=[2, 1, 1],
+ padding=[1, 0, 0],
+ bias=False,
+ ),
+ norm_a=nn.BatchNorm3d(dim_inner),
+ act_a=nn.ReLU(),
+ conv_b=nn.ModuleList(
+ [
+ nn.Conv3d(
+ dim_inner,
+ dim_inner,
+ kernel_size=[1, 3, 3],
+ stride=[1, 2, 2],
+ padding=[0, 1, 1],
+ groups=1,
+ dilation=[1, 1, 1],
+ bias=False,
+ ),
+ nn.Conv3d(
+ dim_inner,
+ dim_inner,
+ kernel_size=[1, 3, 3],
+ stride=[1, 2, 2],
+ padding=[0, 1, 1],
+ groups=1,
+ dilation=[1, 1, 1],
+ bias=False,
+ ),
+ ]
+ ),
+ norm_b=nn.ModuleList(
+ [nn.BatchNorm3d(dim_inner), nn.BatchNorm3d(dim_inner)]
+ ),
+ act_b=nn.ModuleList([nn.ReLU(), nn.ReLU()]),
+ conv_c=nn.Conv3d(
+ dim_inner * 2,
+ dim_out,
+ kernel_size=[1, 1, 1],
+ stride=[1, 1, 1],
+ padding=[0, 0, 0],
+ bias=False,
+ ),
+ norm_c=nn.BatchNorm3d(dim_out),
+ reduce_method="cat",
+ )
+
+ # Test forwarding.
+ for input_tensor in TestBottleneckBlock._get_inputs(dim_in):
+ if input_tensor.shape[1] != dim_in:
+ with self.assertRaises(Exception):
+ output_tensor = model(input_tensor)
+ continue
+
+ output_tensor = model(input_tensor)
+ input_shape = input_tensor.shape
+ output_shape = output_tensor.shape
+
+ output_shape_gt = (
+ input_shape[0],
+ dim_out,
+ (input_shape[2] - 1) // 2 + 1,
+ (input_shape[3] - 1) // 2 + 1,
+ (input_shape[4] - 1) // 2 + 1,
+ )
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ def test_create_acoustic_bottleneck_block_with_callable(self):
+ """
+ Test builder `create_acoustic_bottleneck_block` with callable inputs.
+ """
+ for (norm_model, act_model) in itertools.product(
+ (nn.BatchNorm3d,), (nn.ReLU, nn.Softmax, nn.Sigmoid)
+ ):
+ model = create_acoustic_bottleneck_block(
+ dim_in=32,
+ dim_inner=16,
+ dim_out=64,
+ conv_a_kernel_size=(3, 1, 1),
+ conv_a_stride=(1, 1, 1),
+ conv_a_padding=(1, 0, 0),
+ conv_b_kernel_size=(3, 3, 3),
+ conv_b_stride=(1, 1, 1),
+ conv_b_padding=(1, 1, 1),
+ conv_b_num_groups=1,
+ conv_b_dilation=(1, 1, 1),
+ norm=norm_model,
+ activation=act_model,
+ )
+ model_gt = SeparableBottleneckBlock(
+ conv_a=nn.Conv3d(
+ 32,
+ 16,
+ kernel_size=[3, 1, 1],
+ stride=[1, 1, 1],
+ padding=[1, 0, 0],
+ bias=False,
+ ),
+ norm_a=norm_model(16),
+ act_a=act_model(),
+ conv_b=nn.ModuleList(
+ [
+ nn.Conv3d(
+ 16,
+ 16,
+ kernel_size=[1, 3, 3],
+ stride=[1, 1, 1],
+ padding=[0, 1, 1],
+ dilation=1,
+ bias=False,
+ ),
+ nn.Conv3d(
+ 16,
+ 16,
+ kernel_size=[3, 1, 1],
+ stride=[1, 1, 1],
+ padding=[1, 0, 0],
+ dilation=1,
+ bias=False,
+ ),
+ ]
+ ),
+ norm_b=nn.ModuleList([norm_model(16), norm_model(16)]),
+ act_b=nn.ModuleList([act_model(), act_model()]),
+ conv_c=nn.Conv3d(
+ 16,
+ 64,
+ kernel_size=[1, 1, 1],
+ stride=[1, 1, 1],
+ padding=[0, 0, 0],
+ bias=False,
+ ),
+ norm_c=norm_model(64),
+ )
+
+ model.load_state_dict(
+ model_gt.state_dict(), strict=True
+ ) # explicitly use strict mode.
+
+ # Test forwarding.
+ for input_tensor in TestBottleneckBlock._get_inputs(dim_in=32):
+ with torch.no_grad():
+ if input_tensor.shape[1] != 32:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(input_tensor)
+ continue
+
+ output_tensor = model(input_tensor)
+ output_tensor_gt = model_gt(input_tensor)
+ self.assertEqual(
+ output_tensor.shape,
+ output_tensor_gt.shape,
+ "Output shape {} is different from expected shape {}".format(
+ output_tensor.shape, output_tensor_gt.shape
+ ),
+ )
+ self.assertTrue(
+ np.allclose(output_tensor.numpy(), output_tensor_gt.numpy())
+ )
+
+ def test_create_acoustic_building_block_with_callable(self):
+ """
+        Test builder `create_acoustic_building_block` with callable inputs.
+ """
+ for (norm_model, act_model) in itertools.product(
+ (nn.BatchNorm3d,), (nn.ReLU, nn.Softmax, nn.Sigmoid)
+ ):
+ model = create_acoustic_building_block(
+ dim_in=32,
+ dim_inner=16,
+ dim_out=64,
+ conv_a_kernel_size=(3, 1, 1),
+ conv_a_stride=(1, 1, 1),
+ conv_a_padding=(1, 0, 0),
+ conv_b_kernel_size=(3, 3, 3),
+ conv_b_stride=(1, 1, 1),
+ conv_b_padding=(1, 1, 1),
+ conv_b_num_groups=1,
+ conv_b_dilation=(1, 1, 1),
+ norm=norm_model,
+ activation=act_model,
+ )
+ model_gt = SeparableBottleneckBlock(
+ conv_a=None,
+ norm_a=None,
+ act_a=None,
+ conv_b=nn.ModuleList(
+ [
+ nn.Conv3d(
+ 32,
+ 16,
+ kernel_size=[3, 1, 1],
+ stride=[1, 1, 1],
+ padding=[1, 0, 0],
+ dilation=1,
+ bias=False,
+ ),
+ nn.Conv3d(
+ 32,
+ 16,
+ kernel_size=[1, 3, 3],
+ stride=[1, 1, 1],
+ padding=[0, 1, 1],
+ dilation=1,
+ bias=False,
+ ),
+ ]
+ ),
+ norm_b=nn.ModuleList([norm_model(16), norm_model(16)]),
+ act_b=nn.ModuleList([act_model(), act_model()]),
+ conv_c=nn.Conv3d(
+ 16,
+ 64,
+ kernel_size=[1, 1, 1],
+ stride=[1, 1, 1],
+ padding=[0, 0, 0],
+ bias=False,
+ ),
+ norm_c=norm_model(64),
+ )
+
+ model.load_state_dict(
+ model_gt.state_dict(), strict=True
+ ) # explicitly use strict mode.
+
+ # Test forwarding.
+ for input_tensor in TestBottleneckBlock._get_inputs(dim_in=32):
+ with torch.no_grad():
+ if input_tensor.shape[1] != 32:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(input_tensor)
+ continue
+
+ output_tensor = model(input_tensor)
+ output_tensor_gt = model_gt(input_tensor)
+ self.assertEqual(
+ output_tensor.shape,
+ output_tensor_gt.shape,
+ "Output shape {} is different from expected shape {}".format(
+ output_tensor.shape, output_tensor_gt.shape
+ ),
+ )
+ self.assertTrue(
+ np.allclose(output_tensor.numpy(), output_tensor_gt.numpy())
+ )
+
+ def test_create_bottleneck_block_with_callable(self):
+ """
+ Test builder `create_bottleneck_block` with callable inputs.
+ """
+ for (norm_model, act_model) in itertools.product(
+ (nn.BatchNorm3d,), (nn.ReLU, nn.Softmax, nn.Sigmoid)
+ ):
+ model = create_bottleneck_block(
+ dim_in=32,
+ dim_inner=16,
+ dim_out=64,
+ conv_a_kernel_size=(3, 1, 1),
+ conv_a_stride=(1, 1, 1),
+ conv_a_padding=(1, 0, 0),
+ conv_b_kernel_size=(1, 3, 3),
+ conv_b_stride=(1, 1, 1),
+ conv_b_padding=(0, 1, 1),
+ conv_b_num_groups=1,
+ conv_b_dilation=(1, 1, 1),
+ norm=norm_model,
+ activation=act_model,
+ )
+ model_gt = BottleneckBlock(
+ conv_a=nn.Conv3d(
+ 32,
+ 16,
+ kernel_size=[3, 1, 1],
+ stride=[1, 1, 1],
+ padding=[1, 0, 0],
+ bias=False,
+ ),
+ norm_a=norm_model(16),
+ act_a=act_model(),
+ conv_b=nn.Conv3d(
+ 16,
+ 16,
+ kernel_size=[1, 3, 3],
+ stride=[1, 1, 1],
+ padding=[0, 1, 1],
+ bias=False,
+ ),
+ norm_b=norm_model(16),
+ act_b=act_model(),
+ conv_c=nn.Conv3d(
+ 16,
+ 64,
+ kernel_size=[1, 1, 1],
+ stride=[1, 1, 1],
+ padding=[0, 0, 0],
+ bias=False,
+ ),
+ norm_c=norm_model(64),
+ )
+
+ model.load_state_dict(
+ model_gt.state_dict(), strict=True
+ ) # explicitly use strict mode.
+
+ # Test forwarding.
+ for input_tensor in TestBottleneckBlock._get_inputs(dim_in=32):
+ with torch.no_grad():
+ if input_tensor.shape[1] != 32:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(input_tensor)
+ continue
+
+ output_tensor = model(input_tensor)
+ output_tensor_gt = model_gt(input_tensor)
+ self.assertEqual(
+ output_tensor.shape,
+ output_tensor_gt.shape,
+ "Output shape {} is different from expected shape {}".format(
+ output_tensor.shape, output_tensor_gt.shape
+ ),
+ )
+ self.assertTrue(
+ np.allclose(output_tensor.numpy(), output_tensor_gt.numpy())
+ )
+
+ @staticmethod
+ def _get_inputs(dim_in: int = 3) -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random inputs as test cases.
+ shapes = (
+ # Forward succeeded.
+ (1, dim_in, 3, 7, 7),
+ (1, dim_in, 5, 7, 7),
+ (1, dim_in, 7, 7, 7),
+ (2, dim_in, 3, 7, 7),
+ (4, dim_in, 3, 7, 7),
+ (8, dim_in, 3, 7, 7),
+ (2, dim_in, 3, 7, 14),
+ (2, dim_in, 3, 14, 7),
+ (2, dim_in, 3, 14, 14),
+ # Forward failed.
+ (8, dim_in * 2, 3, 7, 7),
+ (8, dim_in * 4, 5, 7, 7),
+ )
+ for shape in shapes:
+ yield torch.rand(shape)
+
+
+class TestResBottleneckBlock(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_create_res_block(self):
+ """
+ Test simple ResBlock with different inputs.
+ """
+ for dim_in, dim_inner, dim_out in itertools.product(
+ (4, 8, 16), (2, 4), (4, 8, 16)
+ ):
+ model = ResBlock(
+ branch1_conv=nn.Conv3d(
+ dim_in, dim_out, kernel_size=(1, 1, 1), stride=(1, 1, 1)
+ )
+ if dim_in != dim_out
+ else None,
+ branch1_norm=nn.BatchNorm3d(num_features=dim_out)
+ if dim_in != dim_out
+ else None,
+ branch2=BottleneckBlock(
+ conv_a=nn.Conv3d(
+ dim_in,
+ dim_inner,
+ kernel_size=[3, 1, 1],
+ stride=[1, 1, 1],
+ padding=[1, 0, 0],
+ bias=False,
+ ),
+ norm_a=nn.BatchNorm3d(dim_inner),
+ act_a=nn.ReLU(),
+ conv_b=nn.Conv3d(
+ dim_inner,
+ dim_inner,
+ kernel_size=[1, 3, 3],
+ stride=[1, 1, 1],
+ padding=[0, 1, 1],
+ bias=False,
+ ),
+ norm_b=nn.BatchNorm3d(dim_inner),
+ act_b=nn.ReLU(),
+ conv_c=nn.Conv3d(
+ dim_inner,
+ dim_out,
+ kernel_size=[1, 1, 1],
+ stride=[1, 1, 1],
+ padding=[0, 0, 0],
+ bias=False,
+ ),
+ norm_c=nn.BatchNorm3d(dim_out),
+ ),
+ activation=nn.ReLU(),
+ branch_fusion=lambda x, y: x + y,
+ )
+
+ # Test forwarding.
+ for input_tensor in TestBottleneckBlock._get_inputs(dim_in):
+ if input_tensor.shape[1] != dim_in:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(input_tensor)
+ continue
+
+ output_tensor = model(input_tensor)
+
+ input_shape = input_tensor.shape
+ output_shape = output_tensor.shape
+ output_shape_gt = (
+ input_shape[0],
+ dim_out,
+ input_shape[2],
+ input_shape[3],
+ input_shape[4],
+ )
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ def test_create_res_block_with_callable(self):
+ """
+ Test builder `create_res_block` with callable inputs.
+ """
+ for (norm, activation) in itertools.product(
+ (nn.BatchNorm3d, None), (nn.ReLU, nn.Softmax, nn.Sigmoid, None)
+ ):
+ model = create_res_block(
+ dim_in=32,
+ dim_inner=16,
+ dim_out=64,
+ bottleneck=create_bottleneck_block,
+ conv_a_kernel_size=(3, 1, 1),
+ conv_a_stride=(1, 1, 1),
+ conv_a_padding=(1, 0, 0),
+ conv_b_kernel_size=(1, 3, 3),
+ conv_b_stride=(1, 2, 2),
+ conv_b_padding=(0, 1, 1),
+ conv_b_num_groups=1,
+ conv_b_dilation=(1, 1, 1),
+ norm=norm,
+ norm_eps=1e-5,
+ norm_momentum=0.1,
+ activation_bottleneck=activation,
+ activation_block=activation,
+ )
+ model_gt = ResBlock(
+ branch1_conv=nn.Conv3d(
+ 32, 64, kernel_size=(1, 1, 1), stride=(1, 2, 2), bias=False
+ ),
+ branch1_norm=None if norm is None else norm(num_features=64),
+ branch2=BottleneckBlock(
+ conv_a=nn.Conv3d(
+ 32,
+ 16,
+ kernel_size=[3, 1, 1],
+ stride=[1, 1, 1],
+ padding=[1, 0, 0],
+ bias=False,
+ ),
+ norm_a=None if norm is None else norm(16),
+ act_a=None if activation is None else activation(),
+ conv_b=nn.Conv3d(
+ 16,
+ 16,
+ kernel_size=[1, 3, 3],
+ stride=[1, 2, 2],
+ padding=[0, 1, 1],
+ bias=False,
+ ),
+ norm_b=None if norm is None else norm(16),
+ act_b=None if activation is None else activation(),
+ conv_c=nn.Conv3d(
+ 16,
+ 64,
+ kernel_size=[1, 1, 1],
+ stride=[1, 1, 1],
+ padding=[0, 0, 0],
+ bias=False,
+ ),
+ norm_c=None if norm is None else norm(64),
+ ),
+ activation=None if activation is None else activation(),
+ branch_fusion=lambda x, y: x + y,
+ )
+
+ model.load_state_dict(
+ model_gt.state_dict(), strict=True
+ ) # explicitly use strict mode.
+
+ # Test forwarding.
+ for input_tensor in TestBottleneckBlock._get_inputs(dim_in=32):
+ with torch.no_grad():
+ if input_tensor.shape[1] != 32:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(input_tensor)
+ continue
+
+ output_tensor = model(input_tensor)
+ output_tensor_gt = model_gt(input_tensor)
+
+ self.assertEqual(
+ output_tensor.shape,
+ output_tensor_gt.shape,
+ "Output shape {} is different from expected shape {}".format(
+ output_tensor.shape, output_tensor_gt.shape
+ ),
+ )
+ self.assertTrue(
+ np.allclose(output_tensor.numpy(), output_tensor_gt.numpy())
+ )
+
+ @staticmethod
+ def _get_inputs(dim_in: int = 3) -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random inputs as test cases.
+ shapes = (
+ # Forward succeeded.
+ (1, dim_in, 3, 7, 7),
+ (1, dim_in, 5, 7, 7),
+ (1, dim_in, 7, 7, 7),
+ (2, dim_in, 3, 7, 7),
+ (4, dim_in, 3, 7, 7),
+ (8, dim_in, 3, 7, 7),
+ (2, dim_in, 3, 7, 14),
+ (2, dim_in, 3, 14, 7),
+ (2, dim_in, 3, 14, 14),
+ # Forward failed.
+ (8, dim_in * 2, 3, 7, 7),
+ (8, dim_in * 4, 5, 7, 7),
+ )
+ for shape in shapes:
+ yield torch.rand(shape)
+
+
+class TestResStageTransform(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_create_res_stage(self):
+ """
+ Test simple ResStage with different inputs.
+ """
+ for dim_in, dim_inner, dim_out in itertools.product(
+ (4, 8, 16), (2, 4), (4, 8, 16)
+ ):
+ model = ResStage(
+ res_blocks=nn.ModuleList(
+ [
+ ResBlock(
+ branch1_conv=nn.Conv3d(
+ dim_in, dim_out, kernel_size=(1, 1, 1)
+ )
+ if dim_in != dim_out
+ else None,
+ branch1_norm=nn.BatchNorm3d(num_features=dim_out)
+ if dim_in != dim_out
+ else None,
+ branch2=BottleneckBlock(
+ conv_a=nn.Conv3d(
+ dim_in,
+ dim_inner,
+ kernel_size=[3, 1, 1],
+ stride=[1, 1, 1],
+ padding=[1, 0, 0],
+ bias=False,
+ ),
+ norm_a=nn.BatchNorm3d(dim_inner),
+ act_a=nn.ReLU(),
+ conv_b=nn.Conv3d(
+ dim_inner,
+ dim_inner,
+ kernel_size=[1, 3, 3],
+ stride=[1, 1, 1],
+ padding=[0, 1, 1],
+ bias=False,
+ ),
+ norm_b=nn.BatchNorm3d(dim_inner),
+ act_b=nn.ReLU(),
+ conv_c=nn.Conv3d(
+ dim_inner,
+ dim_out,
+ kernel_size=[1, 1, 1],
+ stride=[1, 1, 1],
+ padding=[0, 0, 0],
+ bias=False,
+ ),
+ norm_c=nn.BatchNorm3d(dim_out),
+ ),
+ activation=nn.ReLU(),
+ branch_fusion=lambda x, y: x + y,
+ ),
+ ResBlock(
+ branch1_conv=None,
+ branch1_norm=None,
+ branch2=BottleneckBlock(
+ conv_a=nn.Conv3d(
+ dim_out,
+ dim_inner,
+ kernel_size=[3, 1, 1],
+ stride=[1, 1, 1],
+ padding=[1, 0, 0],
+ bias=False,
+ ),
+ norm_a=nn.BatchNorm3d(dim_inner),
+ act_a=nn.ReLU(),
+ conv_b=nn.Conv3d(
+ dim_inner,
+ dim_inner,
+ kernel_size=[1, 3, 3],
+ stride=[1, 1, 1],
+ padding=[0, 1, 1],
+ bias=False,
+ ),
+ norm_b=nn.BatchNorm3d(dim_inner),
+ act_b=nn.ReLU(),
+ conv_c=nn.Conv3d(
+ dim_inner,
+ dim_out,
+ kernel_size=[1, 1, 1],
+ stride=[1, 1, 1],
+ padding=[0, 0, 0],
+ bias=False,
+ ),
+ norm_c=nn.BatchNorm3d(dim_out),
+ ),
+ activation=nn.ReLU(),
+ branch_fusion=lambda x, y: x + y,
+ ),
+ ]
+ )
+ )
+
+ # Test forwarding.
+ for tensor in TestResStageTransform._get_inputs(dim_in):
+ if tensor.shape[1] != dim_in:
+ with self.assertRaises(RuntimeError):
+ out = model(tensor)
+ continue
+
+ out = model(tensor)
+
+ input_shape = tensor.shape
+ output_shape = out.shape
+ output_shape_gt = (
+ input_shape[0],
+ dim_out,
+ input_shape[2],
+ input_shape[3],
+ input_shape[4],
+ )
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ def test_create_res_stage_with_callable(self):
+ """
+ Test builder `create_res_stage` with callable inputs.
+ """
+ dim_in, dim_inner, dim_out = 32, 16, 64
+ for (norm, activation) in itertools.product(
+ (nn.BatchNorm3d, None), (nn.ReLU, nn.Sigmoid, None)
+ ):
+ model = create_res_stage(
+ depth=2,
+ dim_in=dim_in,
+ dim_inner=dim_inner,
+ dim_out=dim_out,
+ bottleneck=create_bottleneck_block,
+ conv_a_kernel_size=(3, 1, 1),
+ conv_a_stride=(1, 1, 1),
+ conv_a_padding=(1, 0, 0),
+ conv_b_kernel_size=(1, 3, 3),
+ conv_b_stride=(1, 1, 1),
+ conv_b_padding=(0, 1, 1),
+ conv_b_num_groups=1,
+ conv_b_dilation=(1, 1, 1),
+ norm=norm,
+ norm_eps=1e-5,
+ norm_momentum=0.1,
+ activation=activation,
+ )
+ model_gt = ResStage(
+ res_blocks=nn.ModuleList(
+ [
+ ResBlock(
+ branch1_conv=nn.Conv3d(
+ dim_in, dim_out, kernel_size=(1, 1, 1), bias=False
+ )
+ if dim_in != dim_out
+ else None,
+ branch1_norm=None
+ if norm is None
+ else norm(num_features=dim_out)
+ if dim_in != dim_out
+ else None,
+ branch2=BottleneckBlock(
+ conv_a=nn.Conv3d(
+ dim_in,
+ dim_inner,
+ kernel_size=[3, 1, 1],
+ stride=[1, 1, 1],
+ padding=[1, 0, 0],
+ bias=False,
+ ),
+ norm_a=None if norm is None else norm(dim_inner),
+ act_a=None if activation is None else activation(),
+ conv_b=nn.Conv3d(
+ dim_inner,
+ dim_inner,
+ kernel_size=[1, 3, 3],
+ stride=[1, 1, 1],
+ padding=[0, 1, 1],
+ bias=False,
+ ),
+ norm_b=None if norm is None else norm(dim_inner),
+ act_b=None if activation is None else activation(),
+ conv_c=nn.Conv3d(
+ dim_inner,
+ dim_out,
+ kernel_size=[1, 1, 1],
+ stride=[1, 1, 1],
+ padding=[0, 0, 0],
+ bias=False,
+ ),
+ norm_c=None if norm is None else norm(dim_out),
+ ),
+ activation=None if activation is None else activation(),
+ branch_fusion=lambda x, y: x + y,
+ ),
+ ResBlock(
+ branch1_conv=None,
+ branch1_norm=None,
+ branch2=BottleneckBlock(
+ conv_a=nn.Conv3d(
+ dim_out,
+ dim_inner,
+ kernel_size=[3, 1, 1],
+ stride=[1, 1, 1],
+ padding=[1, 0, 0],
+ bias=False,
+ ),
+ norm_a=None if norm is None else norm(dim_inner),
+ act_a=None if activation is None else activation(),
+ conv_b=nn.Conv3d(
+ dim_inner,
+ dim_inner,
+ kernel_size=[1, 3, 3],
+ stride=[1, 1, 1],
+ padding=[0, 1, 1],
+ bias=False,
+ ),
+ norm_b=None if norm is None else norm(dim_inner),
+ act_b=None if activation is None else activation(),
+ conv_c=nn.Conv3d(
+ dim_inner,
+ dim_out,
+ kernel_size=[1, 1, 1],
+ stride=[1, 1, 1],
+ padding=[0, 0, 0],
+ bias=False,
+ ),
+ norm_c=None if norm is None else norm(dim_out),
+ ),
+ activation=None if activation is None else activation(),
+ branch_fusion=lambda x, y: x + y,
+ ),
+ ]
+ )
+ )
+ model.load_state_dict(
+ model_gt.state_dict(), strict=True
+ ) # explicitly use strict mode.
+
+ # Test forwarding.
+ for tensor in TestResStageTransform._get_inputs(dim_in=dim_in):
+ with torch.no_grad():
+ if tensor.shape[1] != 32:
+ with self.assertRaises(RuntimeError):
+ out = model(tensor)
+ continue
+
+ out = model(tensor)
+ out_gt = model_gt(tensor)
+
+ self.assertEqual(
+ out.shape,
+ out_gt.shape,
+ "Output shape {} is different from expected shape {}".format(
+ out.shape, out_gt.shape
+ ),
+ )
+ self.assertTrue(np.allclose(out.numpy(), out_gt.numpy()))
+
+ @staticmethod
+ def _get_inputs(dim_in: int = 3) -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random inputs as test cases.
+ shapes = (
+ # Forward succeeded.
+ (1, dim_in, 3, 7, 7),
+ (1, dim_in, 5, 7, 7),
+ (1, dim_in, 7, 7, 7),
+ (2, dim_in, 3, 7, 7),
+ (4, dim_in, 3, 7, 7),
+ (8, dim_in, 3, 7, 7),
+ (2, dim_in, 3, 7, 14),
+ (2, dim_in, 3, 14, 7),
+ (2, dim_in, 3, 14, 14),
+ # Forward failed.
+ (8, dim_in * 2, 3, 7, 7),
+ (8, dim_in * 4, 5, 7, 7),
+ )
+ for shape in shapes:
+ yield torch.rand(shape)
+
+
+class TestResNet(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def _build_resnet(
+ self,
+ input_channel,
+ input_clip_length,
+ input_crop_size,
+ model_depth,
+ norm,
+ activation,
+ ):
+ _MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3), 101: (3, 4, 23, 3), 152: (3, 8, 36, 3)}
+ stem_dim_out = 8
+ model_num_class = 10
+ stages = []
+ # create the Stem for ResNet
+ stem = ResNetBasicStem(
+ conv=nn.Conv3d(
+ input_channel,
+ stem_dim_out,
+ kernel_size=[3, 7, 7],
+ stride=[1, 2, 2],
+ padding=[1, 3, 3],
+ bias=False,
+ ),
+ norm=None if norm is None else norm(stem_dim_out),
+ activation=None if activation is None else activation(),
+ pool=nn.MaxPool3d(
+ kernel_size=[1, 3, 3], stride=[1, 2, 2], padding=[0, 1, 1]
+ ),
+ )
+ stages.append(stem)
+
+ # get the number of Blocks for each Stage
+ stage_depths = _MODEL_STAGE_DEPTH[model_depth]
+
+ stage_dim_in = stem_dim_out
+ stage_dim_out = stage_dim_in * 4
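+ # The first stage expands the stem width 4x; later stages double it (see the
+ # updates at the end of the loop below).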
+ stage_spatial_stride = (2, 1, 1, 1)
+ stage_temporal_stride = (2, 1, 1, 1)
+
+ # create each Stage for ResNet
+ for i in range(len(stage_depths)):
+ stage_dim_inner = stage_dim_out // 4
+ depth = stage_depths[i]
+
+ block_dim_in = stage_dim_in
+ block_dim_inner = stage_dim_inner
+ block_dim_out = stage_dim_out
+
+ blocks = []
+ for j in range(depth):
+ spatial_stride = stage_spatial_stride[i] if j == 0 else 1
+ temporal_stride = stage_temporal_stride[i] if j == 0 else 1
+ # create each Block for the Stage
+ block = ResBlock(
+ branch1_conv=nn.Conv3d(
+ block_dim_in,
+ block_dim_out,
+ kernel_size=(1, 1, 1),
+ stride=(temporal_stride, spatial_stride, spatial_stride),
+ bias=False,
+ )
+ if block_dim_in != block_dim_out
+ else None,
+ branch1_norm=None
+ if norm is None
+ else norm(block_dim_out)
+ if block_dim_in != block_dim_out
+ else None,
+ branch2=BottleneckBlock(
+ conv_a=nn.Conv3d(
+ block_dim_in,
+ block_dim_inner,
+ kernel_size=[3, 1, 1],
+ stride=[temporal_stride, 1, 1],
+ padding=[1, 0, 0],
+ bias=False,
+ ),
+ norm_a=None if norm is None else norm(block_dim_inner),
+ act_a=None if activation is None else activation(),
+ conv_b=nn.Conv3d(
+ block_dim_inner,
+ block_dim_inner,
+ kernel_size=[1, 3, 3],
+ stride=[1, spatial_stride, spatial_stride],
+ padding=[0, 1, 1],
+ bias=False,
+ ),
+ norm_b=None if norm is None else norm(block_dim_inner),
+ act_b=None if activation is None else activation(),
+ conv_c=nn.Conv3d(
+ block_dim_inner,
+ block_dim_out,
+ kernel_size=[1, 1, 1],
+ stride=[1, 1, 1],
+ padding=[0, 0, 0],
+ bias=False,
+ ),
+ norm_c=None if norm is None else norm(block_dim_out),
+ ),
+ activation=None if activation is None else activation(),
+ branch_fusion=lambda x, y: x + y,
+ )
+
+ block_dim_in = block_dim_out
+ blocks.append(block)
+
+ stage = ResStage(nn.ModuleList(blocks))
+ stages.append(stage)
+
+ stage_dim_in = stage_dim_out
+ stage_dim_out = stage_dim_out * 2
+
+ # Create Head for ResNet
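+ # The stem downsamples spatially by 4 (stride-2 conv followed by stride-2 pool),
+ # hence the extra factor of 4 below.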
+ total_spatial_stride = 4 * np.prod(stage_spatial_stride)
+ total_temporal_stride = np.prod(stage_temporal_stride)
+ head_pool_kernel_size = (
+ input_clip_length // total_temporal_stride,
+ input_crop_size // total_spatial_stride,
+ input_crop_size // total_spatial_stride,
+ )
+
+ head = ResNetBasicHead(
+ proj=nn.Linear(stage_dim_in, model_num_class),
+ activation=nn.Softmax(),
+ pool=nn.AvgPool3d(kernel_size=head_pool_kernel_size, stride=[1, 1, 1]),
+ dropout=None,
+ output_pool=nn.AdaptiveAvgPool3d(1),
+ )
+ stages.append(head)
+
+ return (Net(blocks=nn.ModuleList(stages)), model_num_class)
+
+ def test_create_resnet(self):
+ """
+ Test simple ResNet with different inputs.
+ """
+ for input_channel, input_clip_length, input_crop_size in itertools.product(
+ (3, 2), (2, 4), (56, 64)
+ ):
+ model_depth = 50
+ model, num_class = self._build_resnet(
+ input_channel,
+ input_clip_length,
+ input_crop_size,
+ model_depth,
+ nn.BatchNorm3d,
+ nn.ReLU,
+ )
+
+ # Test forwarding.
+ for tensor in TestResNet._get_inputs(
+ input_channel, input_clip_length, input_crop_size
+ ):
+ if tensor.shape[1] != input_channel:
+ with self.assertRaises(RuntimeError):
+ out = model(tensor)
+ continue
+
+ out = model(tensor)
+
+ output_shape = out.shape
+ output_shape_gt = (tensor.shape[0], num_class)
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ def test_create_resnet_with_callable(self):
+ """
+ Test builder `create_resnet` with callable inputs.
+ """
+ for (norm, activation) in itertools.product(
+ (nn.BatchNorm3d, None), (nn.ReLU, nn.Sigmoid, None)
+ ):
+ input_channel = 3
+ input_clip_length = 4
+ input_crop_size = 56
+ model_depth = 50
+ stage_spatial_stride = (2, 1, 1, 1)
+ stage_temporal_stride = (2, 1, 1, 1)
+ model_gt, num_class = self._build_resnet(
+ input_channel,
+ input_clip_length,
+ input_crop_size,
+ model_depth,
+ norm,
+ activation,
+ )
+
+ total_spatial_stride = 4 * np.prod(stage_spatial_stride)
+ total_temporal_stride = np.prod(stage_temporal_stride)
+ head_pool_kernel_size = (
+ input_clip_length // total_temporal_stride,
+ input_crop_size // total_spatial_stride,
+ input_crop_size // total_spatial_stride,
+ )
+
+ model = create_resnet(
+ input_channel=input_channel,
+ model_depth=50,
+ model_num_class=num_class,
+ dropout_rate=0,
+ norm=norm,
+ activation=activation,
+ stem_dim_out=8,
+ stem_conv_kernel_size=(3, 7, 7),
+ stem_conv_stride=(1, 2, 2),
+ stem_pool=nn.MaxPool3d,
+ stem_pool_kernel_size=(1, 3, 3),
+ stem_pool_stride=(1, 2, 2),
+ stage_conv_a_kernel_size=((3, 1, 1),) * 4,
+ stage_conv_b_kernel_size=((1, 3, 3),) * 4,
+ stage_spatial_stride=stage_spatial_stride,
+ stage_temporal_stride=stage_temporal_stride,
+ bottleneck=create_bottleneck_block,
+ head_pool=nn.AvgPool3d,
+ head_pool_kernel_size=head_pool_kernel_size,
+ head_output_size=(1, 1, 1),
+ head_activation=nn.Softmax,
+ )
+
+ model.load_state_dict(
+ model_gt.state_dict(), strict=True
+ ) # explicitly use strict mode.
+
+ # Test forwarding.
+ for tensor in TestResNet._get_inputs(
+ input_channel, input_clip_length, input_crop_size
+ ):
+ with torch.no_grad():
+ if tensor.shape[1] != input_channel:
+ with self.assertRaises(RuntimeError):
+ out = model(tensor)
+ continue
+
+ out = model(tensor)
+ out_gt = model_gt(tensor)
+
+ self.assertEqual(
+ out.shape,
+ out_gt.shape,
+ "Output shape {} is different from expected shape {}".format(
+ out.shape, out_gt.shape
+ ),
+ )
+ self.assertTrue(
+ np.allclose(out.numpy(), out_gt.numpy(), rtol=1e-1, atol=1e-1)
+ )
+
+ def test_create_acoustic_resnet_with_callable(self):
+ """
+ Test builder `create_acoustic_resnet` with callable inputs.
+ """
+ _input_channel = 1
+ for (norm, activation) in itertools.product(
+ (nn.BatchNorm3d, None), (nn.ReLU, nn.Sigmoid, None)
+ ):
+ model = create_acoustic_resnet(
+ input_channel=_input_channel,
+ stem_conv_kernel_size=(3, 3, 3),
+ stem_conv_padding=(1, 1, 1),
+ model_depth=50,
+ model_num_class=400,
+ dropout_rate=0,
+ norm=norm,
+ activation=activation,
+ stem_dim_out=8,
+ stem_pool=nn.MaxPool3d,
+ stem_pool_kernel_size=(1, 3, 3),
+ stem_pool_stride=(1, 2, 2),
+ stage_conv_a_kernel_size=(3, 1, 1),
+ stage_conv_b_kernel_size=(1, 3, 3),
+ stage_spatial_stride=(2, 1, 1, 1),
+ stage_temporal_stride=(2, 1, 1, 1),
+ head_pool=nn.AvgPool3d,
+ head_output_size=(1, 1, 1),
+ head_activation=nn.Softmax,
+ )
+
+ # Test forwarding.
+ for tensor in TestResNet._get_inputs(_input_channel, 1, 56):
+ with torch.no_grad():
+ if tensor.shape[1] != _input_channel:
+ with self.assertRaises(RuntimeError):
+ model(tensor)
+ continue
+ model(tensor)
+
+ def test_load_hubconf(self):
+ path = os.path.join(
+ os.path.dirname(os.path.realpath(__file__)),
+ "..",
+ )
+ input_channel = 3
+ input_clip_length = 2
+ input_crop_size = 56
+ model = torch.hub.load(
+ repo_or_dir=path, source="local", model="slow_r50", pretrained=False
+ )
+ self.assertIsNotNone(model)
+
+ # Test forwarding.
+ for tensor in TestResNet._get_inputs(
+ input_channel, input_clip_length, input_crop_size
+ ):
+ with torch.no_grad():
+ if tensor.shape[1] != input_channel:
+ with self.assertRaises(RuntimeError):
+ model(tensor)
+ continue
+
+ @staticmethod
+ def _get_inputs(
+ channel: int = 3, clip_length: int = 8, crop_size: int = 224
+ ) -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random inputs as test cases.
+ shapes = (
+ (1, channel, clip_length, crop_size, crop_size),
+ (2, channel, clip_length, crop_size, crop_size),
+ )
+ for shape in shapes:
+ yield torch.rand(shape)
diff --git a/tests/test_models_slowfast.py b/tests/test_models_slowfast.py
new file mode 100644
index 00000000..27b51ef7
--- /dev/null
+++ b/tests/test_models_slowfast.py
@@ -0,0 +1,99 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import itertools
+import os
+import unittest
+from typing import Tuple
+
+import torch
+from pytorchvideo.models.slowfast import create_slowfast
+from pytorchvideo.transforms.functional import repeat_temporal_frames_subsample
+from torch import nn
+
+
+class TestSlowFast(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_load_hubconf(self):
+ path = os.path.join(
+ os.path.dirname(os.path.realpath(__file__)),
+ "..",
+ )
+ for model_name in ["slowfast_r50", "slowfast_r101"]:
+ model = torch.hub.load(
+ repo_or_dir=path, source="local", model=model_name, pretrained=False
+ )
+ self.assertIsNotNone(model)
+
+ input_clip_length = 32
+ input_crop_size = 224
+ input_channel = 3
+ # Test forwarding.
+ for tensor in TestSlowFast._get_inputs(
+ input_channel, input_clip_length, input_crop_size
+ ):
+ with torch.no_grad():
+ if tensor[0].shape[1] != input_channel:
+ with self.assertRaises(RuntimeError):
+ model(tensor)
+ continue
+
+ model(tensor)
+
+ def test_create_slowfast_with_callable(self):
+ """
+ Test builder `create_slowfast` with callable inputs.
+ """
+ for (norm, activation) in itertools.product(
+ (nn.BatchNorm3d, None), (nn.ReLU, nn.Sigmoid, None)
+ ):
+ input_clip_length = 32
+ input_crop_size = 224
+ input_channel = 3
+
+ model = create_slowfast(
+ slowfast_channel_reduction_ratio=8,
+ slowfast_conv_channel_fusion_ratio=2,
+ slowfast_fusion_conv_kernel_size=(7, 1, 1),
+ slowfast_fusion_conv_stride=(4, 1, 1),
+ input_channels=(input_channel,) * 2,
+ model_depth=18,
+ model_num_class=400,
+ dropout_rate=0,
+ norm=norm,
+ activation=activation,
+ )
+
+ # Test forwarding.
+ for tensor in TestSlowFast._get_inputs(
+ input_channel, input_clip_length, input_crop_size
+ ):
+ with torch.no_grad():
+ if tensor[0].shape[1] != input_channel:
+ with self.assertRaises(RuntimeError):
+ model(tensor)
+ continue
+
+ model(tensor)
+
+ @staticmethod
+ def _get_inputs(
+ channel: int = 3,
+ clip_length: int = 8,
+ crop_size: int = 224,
+ frame_ratios: Tuple[int] = (4, 1),
+ ) -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random inputs as test cases.
+ shapes = ((1, channel, clip_length, crop_size, crop_size),)
+ for shape in shapes:
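+ # Yield a list of pathways (slow, then fast) by temporally subsampling the clip
+ # at the given frame ratios.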
+ yield repeat_temporal_frames_subsample(
+ torch.rand(shape), frame_ratios=frame_ratios, temporal_dim=2
+ )
diff --git a/tests/test_models_stem.py b/tests/test_models_stem.py
new file mode 100644
index 00000000..5056bf8a
--- /dev/null
+++ b/tests/test_models_stem.py
@@ -0,0 +1,303 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import itertools
+import unittest
+
+import numpy as np
+import torch
+from pytorchvideo.layers.convolutions import ConvReduce3D
+from pytorchvideo.models.stem import (
+ ResNetBasicStem,
+ create_acoustic_res_basic_stem,
+ create_res_basic_stem,
+)
+from torch import nn
+
+
+class TestResNetBasicStem(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_create_simple_stem(self):
+ """
+ Test simple ResNetBasicStem (without pooling layer).
+ """
+ for input_dim, output_dim in itertools.product((2, 3), (4, 8, 16)):
+ model = ResNetBasicStem(
+ conv=nn.Conv3d(
+ input_dim,
+ output_dim,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False,
+ ),
+ norm=nn.BatchNorm3d(output_dim),
+ activation=nn.ReLU(),
+ pool=None,
+ )
+
+ # Test forwarding.
+ for tensor in TestResNetBasicStem._get_inputs(input_dim):
+ if tensor.shape[1] != input_dim:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(tensor)
+ continue
+ else:
+ output_tensor = model(tensor)
+
+ input_shape = tensor.shape
+ output_shape = output_tensor.shape
+ output_shape_gt = (
+ input_shape[0],
+ output_dim,
+ input_shape[2],
+ input_shape[3],
+ input_shape[4],
+ )
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ def test_create_stem_with_conv_reduced_3d(self):
+ """
+ Test simple ResNetBasicStem with ConvReduce3D.
+ """
+ for input_dim, output_dim in itertools.product((2, 3), (4, 8, 16)):
+ model = ResNetBasicStem(
+ conv=ConvReduce3D(
+ in_channels=input_dim,
+ out_channels=output_dim,
+ kernel_size=(3, 3),
+ stride=(1, 1),
+ padding=(1, 1),
+ bias=(False, False),
+ ),
+ norm=nn.BatchNorm3d(output_dim),
+ activation=nn.ReLU(),
+ pool=None,
+ )
+
+ # Test forwarding.
+ for tensor in TestResNetBasicStem._get_inputs(input_dim):
+ if tensor.shape[1] != input_dim:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(tensor)
+ continue
+ else:
+ output_tensor = model(tensor)
+
+ input_shape = tensor.shape
+ output_shape = output_tensor.shape
+ output_shape_gt = (
+ input_shape[0],
+ output_dim,
+ input_shape[2],
+ input_shape[3],
+ input_shape[4],
+ )
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ def test_create_complex_stem(self):
+ """
+ Test complex ResNetBasicStem.
+ """
+ for input_dim, output_dim in itertools.product((2, 3), (4, 8, 16)):
+ model = ResNetBasicStem(
+ conv=nn.Conv3d(
+ input_dim,
+ output_dim,
+ kernel_size=[3, 7, 7],
+ stride=[1, 2, 2],
+ padding=[1, 3, 3],
+ bias=False,
+ ),
+ norm=nn.BatchNorm3d(output_dim),
+ activation=nn.ReLU(),
+ pool=nn.MaxPool3d(
+ kernel_size=[1, 3, 3], stride=[1, 2, 2], padding=[0, 1, 1]
+ ),
+ )
+
+ # Test forwarding.
+ for input_tensor in TestResNetBasicStem._get_inputs(input_dim):
+ if input_tensor.shape[1] != input_dim:
+ with self.assertRaises(Exception):
+ output_tensor = model(input_tensor)
+ continue
+ else:
+ output_tensor = model(input_tensor)
+
+ input_shape = input_tensor.shape
+ output_shape = output_tensor.shape
+
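+ # The spatial dims are halved twice (stride-2 conv, then stride-2 max pool);
+ # the temporal dim is unchanged.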
+ output_shape_gt = (
+ input_shape[0],
+ output_dim,
+ input_shape[2],
+ (((input_shape[3] - 1) // 2 + 1) - 1) // 2 + 1,
+ (((input_shape[4] - 1) // 2 + 1) - 1) // 2 + 1,
+ )
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ def test_create_stem_with_callable(self):
+ """
+ Test builder `create_res_basic_stem` with callable inputs.
+ """
+ for (pool, activation, norm) in itertools.product(
+ (nn.AvgPool3d, nn.MaxPool3d, None),
+ (nn.ReLU, nn.Softmax, nn.Sigmoid, None),
+ (nn.BatchNorm3d, None),
+ ):
+ model = create_res_basic_stem(
+ in_channels=3,
+ out_channels=64,
+ pool=pool,
+ activation=activation,
+ norm=norm,
+ )
+ model_gt = ResNetBasicStem(
+ conv=nn.Conv3d(
+ 3,
+ 64,
+ kernel_size=[3, 7, 7],
+ stride=[1, 2, 2],
+ padding=[1, 3, 3],
+ bias=False,
+ ),
+ norm=None if norm is None else norm(64),
+ activation=None if activation is None else activation(),
+ pool=None
+ if pool is None
+ else pool(kernel_size=[1, 3, 3], stride=[1, 2, 2], padding=[0, 1, 1]),
+ )
+
+ model.load_state_dict(
+ model_gt.state_dict(), strict=True
+ ) # explicitly use strict mode.
+
+ # Test forwarding.
+ for input_tensor in TestResNetBasicStem._get_inputs():
+ with torch.no_grad():
+ if input_tensor.shape[1] != 3:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(input_tensor)
+ continue
+ else:
+ output_tensor = model(input_tensor)
+ output_tensor_gt = model_gt(input_tensor)
+ self.assertEqual(
+ output_tensor.shape,
+ output_tensor_gt.shape,
+ "Output shape {} is different from expected shape {}".format(
+ output_tensor.shape, output_tensor_gt.shape
+ ),
+ )
+ self.assertTrue(
+ np.allclose(output_tensor.numpy(), output_tensor_gt.numpy())
+ )
+
+ def test_create_acoustic_stem_with_callable(self):
+ """
+ Test builder `create_acoustic_res_basic_stem` with callable
+ inputs.
+ """
+ for (pool, activation, norm) in itertools.product(
+ (nn.AvgPool3d, nn.MaxPool3d, None),
+ (nn.ReLU, nn.Softmax, nn.Sigmoid, None),
+ (nn.BatchNorm3d, None),
+ ):
+ model = create_acoustic_res_basic_stem(
+ in_channels=3,
+ out_channels=64,
+ pool=pool,
+ activation=activation,
+ norm=norm,
+ )
+ model_gt = ResNetBasicStem(
+ conv=ConvReduce3D(
+ in_channels=3,
+ out_channels=64,
+ kernel_size=((3, 1, 1), (1, 7, 7)),
+ stride=((1, 1, 1), (1, 1, 1)),
+ padding=((1, 0, 0), (0, 3, 3)),
+ bias=(False, False),
+ ),
+ norm=None if norm is None else norm(64),
+ activation=None if activation is None else activation(),
+ pool=None
+ if pool is None
+ else pool(kernel_size=[1, 3, 3], stride=[1, 2, 2], padding=[0, 1, 1]),
+ )
+
+ model.load_state_dict(
+ model_gt.state_dict(), strict=True
+ ) # explicitly use strict mode.
+
+ # Test forwarding.
+ for input_tensor in TestResNetBasicStem._get_inputs():
+ with torch.no_grad():
+ if input_tensor.shape[1] != 3:
+ with self.assertRaises(RuntimeError):
+ output_tensor = model(input_tensor)
+ continue
+ else:
+ output_tensor = model(input_tensor)
+ output_tensor_gt = model_gt(input_tensor)
+ self.assertEqual(
+ output_tensor.shape,
+ output_tensor_gt.shape,
+ "Output shape {} is different from expected shape {}".format(
+ output_tensor.shape, output_tensor_gt.shape
+ ),
+ )
+ self.assertTrue(
+ np.allclose(output_tensor.numpy(), output_tensor_gt.numpy())
+ )
+
+ @staticmethod
+ def _get_inputs(input_dim: int = 3) -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random tensor as test cases.
+ shapes = (
+ # Forward succeeded.
+ (1, input_dim, 3, 7, 7),
+ (1, input_dim, 5, 7, 7),
+ (1, input_dim, 7, 7, 7),
+ (2, input_dim, 3, 7, 7),
+ (4, input_dim, 3, 7, 7),
+ (8, input_dim, 3, 7, 7),
+ (2, input_dim, 3, 7, 14),
+ (2, input_dim, 3, 14, 7),
+ (2, input_dim, 3, 14, 14),
+ # Forward failed.
+ (8, input_dim * 2, 3, 7, 7),
+ (8, input_dim * 4, 5, 7, 7),
+ )
+ for shape in shapes:
+ yield torch.rand(shape)
diff --git a/tests/test_models_x3d.py b/tests/test_models_x3d.py
new file mode 100644
index 00000000..e0014c0b
--- /dev/null
+++ b/tests/test_models_x3d.py
@@ -0,0 +1,135 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import os
+import unittest
+
+import torch
+from pytorchvideo.layers.swish import Swish
+from pytorchvideo.models.x3d import create_x3d, create_x3d_bottleneck_block
+from torch import nn
+
+
+class TestX3d(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_create_x3d(self):
+ """
+ To test different versions of X3D, set the input to:
+ X3D-XS: (4, 160, 2.0, 2.2, 2.25)
+ X3D-S: (13, 160, 2.0, 2.2, 2.25)
+ X3D-M: (16, 224, 2.0, 2.2, 2.25)
+ X3D-L: (16, 312, 2.0, 5.0, 2.25)
+
+ Each of the parameters corresponds to input_clip_length, input_crop_size,
+ width_factor, depth_factor and bottleneck_factor.
+ """
+ for (
+ input_clip_length,
+ input_crop_size,
+ width_factor,
+ depth_factor,
+ bottleneck_factor,
+ ) in [
+ (4, 160, 2.0, 2.2, 2.25),
+ ]:
+ model = create_x3d(
+ input_clip_length=input_clip_length,
+ input_crop_size=input_crop_size,
+ model_num_class=400,
+ dropout_rate=0.5,
+ width_factor=width_factor,
+ depth_factor=depth_factor,
+ norm=nn.BatchNorm3d,
+ activation=nn.ReLU,
+ stem_dim_in=12,
+ stem_conv_kernel_size=(5, 3, 3),
+ stem_conv_stride=(1, 2, 2),
+ stage_conv_kernel_size=((3, 3, 3),) * 4,
+ stage_spatial_stride=(2, 2, 2, 2),
+ stage_temporal_stride=(1, 1, 1, 1),
+ bottleneck=create_x3d_bottleneck_block,
+ bottleneck_factor=bottleneck_factor,
+ se_ratio=0.0625,
+ inner_act=Swish,
+ head_dim_out=2048,
+ head_pool_act=nn.ReLU,
+ head_bn_lin5_on=False,
+ head_activation=nn.Softmax,
+ )
+
+ # Test forwarding.
+ for tensor in TestX3d._get_inputs(input_clip_length, input_crop_size):
+ if tensor.shape[1] != 3:
+ with self.assertRaises(RuntimeError):
+ out = model(tensor)
+ continue
+
+ out = model(tensor)
+
+ output_shape = out.shape
+ output_shape_gt = (tensor.shape[0], 400)
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ def test_load_hubconf(self):
+ path = os.path.join(
+ os.path.dirname(os.path.realpath(__file__)),
+ "..",
+ )
+ for (input_clip_length, input_crop_size, model_name) in [
+ (4, 160, "x3d_xs"),
+ (13, 160, "x3d_s"),
+ (16, 224, "x3d_m"),
+ ]:
+ model = torch.hub.load(
+ repo_or_dir=path,
+ source="local",
+ model=model_name,
+ pretrained=False,
+ head_output_with_global_average=True,
+ )
+ self.assertIsNotNone(model)
+
+ # Test forwarding.
+ for tensor in TestX3d._get_inputs(input_clip_length, input_crop_size):
+ if tensor.shape[1] != 3:
+ with self.assertRaises(RuntimeError):
+ out = model(tensor)
+ continue
+
+ out = model(tensor)
+
+ output_shape = out.shape
+ output_shape_gt = (tensor.shape[0], 400)
+
+ self.assertEqual(
+ output_shape,
+ output_shape_gt,
+ "Output shape {} is different from expected shape {}".format(
+ output_shape, output_shape_gt
+ ),
+ )
+
+ @staticmethod
+ def _get_inputs(clip_length: int = 4, crop_size: int = 160) -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random inputs as test cases.
+ shapes = (
+ (1, 3, clip_length, crop_size, crop_size),
+ (2, 3, clip_length, crop_size, crop_size),
+ )
+ for shape in shapes:
+ yield torch.rand(shape)
diff --git a/tests/test_simclr.py b/tests/test_simclr.py
new file mode 100644
index 00000000..962b5425
--- /dev/null
+++ b/tests/test_simclr.py
@@ -0,0 +1,38 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import unittest
+
+import torch
+from pytorchvideo.models.simclr import SimCLR
+from torch import nn
+
+
+class TestSimCLR(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ torch.set_rng_state(torch.manual_seed(42).get_state())
+
+ def test_simclr(self):
+ simclr = SimCLR(
+ backbone=nn.Linear(8, 4),
+ mlp=nn.Linear(4, 2),
+ temperature=0.07,
+ )
+ for crop1, crop2 in TestSimCLR._get_inputs():
+ simclr(crop1, crop2)
+
+ @staticmethod
+ def _get_inputs() -> torch.tensor:
+ """
+ Provide different tensors as test cases.
+
+ Yield:
+ (torch.tensor): tensor as test case input.
+ """
+ # Prepare random inputs as test cases.
+ shapes = (
+ (1, 8),
+ (2, 8),
+ )
+ for shape in shapes:
+ yield torch.rand(shape), torch.rand(shape)
diff --git a/tests/test_transforms.py b/tests/test_transforms.py
new file mode 100644
index 00000000..30ddba16
--- /dev/null
+++ b/tests/test_transforms.py
@@ -0,0 +1,196 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import unittest
+
+import torch
+from pytorchvideo.data.utils import thwc_to_cthw
+from pytorchvideo.transforms import (
+ ApplyTransformToKey,
+ RandomShortSideScale,
+ UniformCropVideo,
+ UniformTemporalSubsample,
+)
+from pytorchvideo.transforms.functional import (
+ repeat_temporal_frames_subsample,
+ short_side_scale,
+ uniform_crop,
+ uniform_temporal_subsample,
+)
+from torchvision.transforms import Compose
+from torchvision.transforms._transforms_video import (
+ NormalizeVideo,
+ RandomCropVideo,
+ RandomHorizontalFlipVideo,
+)
+from utils import create_dummy_video_frames
+
+
+class TestTransforms(unittest.TestCase):
+ def test_compose_with_video_transforms(self):
+ video = thwc_to_cthw(create_dummy_video_frames(20, 30, 40)).to(
+ dtype=torch.float32
+ )
+ test_clip = {"video": video, "label": 0}
+
+ # Compose using torchvision and pytorchvideo transforms to ensure they interact
+ # correctly.
+ num_subsample = 10
+ transform = Compose(
+ [
+ ApplyTransformToKey(
+ key="video",
+ transform=Compose(
+ [
+ UniformTemporalSubsample(num_subsample),
+ NormalizeVideo([video.mean()] * 3, [video.std()] * 3),
+ RandomShortSideScale(min_size=15, max_size=25),
+ RandomCropVideo(10),
+ RandomHorizontalFlipVideo(p=0.5),
+ ]
+ ),
+ )
+ ]
+ )
+
+ actual = transform(test_clip)
+ c, t, h, w = actual["video"].shape
+ self.assertEqual(c, 3)
+ self.assertEqual(t, num_subsample)
+ self.assertEqual(h, 10)
+ self.assertEqual(w, 10)
+
+ def test_uniform_temporal_subsample(self):
+ video = thwc_to_cthw(create_dummy_video_frames(20, 30, 40)).to(
+ dtype=torch.float32
+ )
+ actual = uniform_temporal_subsample(video, video.shape[1])
+ self.assertTrue(actual.equal(video))
+
+ video = thwc_to_cthw(create_dummy_video_frames(20, 30, 40)).to(
+ dtype=torch.float32
+ )
+ actual = uniform_temporal_subsample(video, video.shape[1] // 2)
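+ # Half-rate subsampling should pick evenly spaced indices, keeping the first
+ # and last frames.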
+ self.assertTrue(actual.equal(video[:, [0, 2, 4, 6, 8, 10, 12, 14, 16, 19]]))
+
+ video = thwc_to_cthw(create_dummy_video_frames(20, 30, 40)).to(
+ dtype=torch.float32
+ )
+ actual = uniform_temporal_subsample(video, 1)
+ self.assertTrue(actual.equal(video[:, 0:1]))
+
+ def test_short_side_scale_width_shorter_pytorch(self):
+ video = thwc_to_cthw(create_dummy_video_frames(20, 20, 10)).to(
+ dtype=torch.float32
+ )
+ actual = short_side_scale(video, 5, backend="pytorch")
+ self.assertEqual(actual.shape, (3, 20, 10, 5))
+
+ def test_short_side_scale_height_shorter_pytorch(self):
+ video = thwc_to_cthw(create_dummy_video_frames(20, 10, 20)).to(
+ dtype=torch.float32
+ )
+ actual = short_side_scale(video, 5, backend="pytorch")
+ self.assertEqual(actual.shape, (3, 20, 5, 10))
+
+ def test_short_side_scale_equal_size_pytorch(self):
+ video = thwc_to_cthw(create_dummy_video_frames(20, 10, 10)).to(
+ dtype=torch.float32
+ )
+ actual = short_side_scale(video, 10, backend="pytorch")
+ self.assertEqual(actual.shape, (3, 20, 10, 10))
+
+ def test_short_side_scale_width_shorter_opencv(self):
+ video = thwc_to_cthw(create_dummy_video_frames(20, 20, 10)).to(
+ dtype=torch.float32
+ )
+ actual = short_side_scale(video, 5, backend="opencv")
+ self.assertEqual(actual.shape, (3, 20, 10, 5))
+
+ def test_short_side_scale_height_shorter_opencv(self):
+ video = thwc_to_cthw(create_dummy_video_frames(20, 10, 20)).to(
+ dtype=torch.float32
+ )
+ actual = short_side_scale(video, 5, backend="opencv")
+ self.assertEqual(actual.shape, (3, 20, 5, 10))
+
+ def test_short_side_scale_equal_size_opencv(self):
+ video = thwc_to_cthw(create_dummy_video_frames(20, 10, 10)).to(
+ dtype=torch.float32
+ )
+ actual = short_side_scale(video, 10, backend="opencv")
+ self.assertEqual(actual.shape, (3, 20, 10, 10))
+
+ def test_torchscriptable_input_output(self):
+ video = thwc_to_cthw(create_dummy_video_frames(20, 30, 40)).to(
+ dtype=torch.float32
+ )
+
+ # Test all the torchscriptable tensors.
+ for transform in [UniformTemporalSubsample(10), RandomShortSideScale(10, 20)]:
+
+ transform_script = torch.jit.script(transform)
+ self.assertTrue(isinstance(transform_script, torch.jit.ScriptModule))
+
+ # Seed before each transform to force determinism.
+ torch.manual_seed(0)
+ output = transform(video)
+ torch.manual_seed(0)
+ script_output = transform_script(video)
+ self.assertTrue(output.equal(script_output))
+
+ def test_repeat_temporal_frames_subsample(self):
+ video = thwc_to_cthw(create_dummy_video_frames(32, 10, 10)).to(
+ dtype=torch.float32
+ )
+ actual = repeat_temporal_frames_subsample(video, (1, 4))
+ expected_shape = ((3, 32, 10, 10), (3, 8, 10, 10))
+ for idx in range(len(actual)):
+ self.assertEqual(actual[idx].shape, expected_shape[idx])
+
+ def test_uniform_crop(self):
+ # For videos with height < width.
+ video = thwc_to_cthw(create_dummy_video_frames(20, 30, 40)).to(
+ dtype=torch.float32
+ )
+ # Left crop.
+ actual = uniform_crop(video, size=20, spatial_idx=0)
+ self.assertTrue(actual.equal(video[:, :, 5:25, :20]))
+ # Center crop.
+ actual = uniform_crop(video, size=20, spatial_idx=1)
+ self.assertTrue(actual.equal(video[:, :, 5:25, 10:30]))
+ # Right crop.
+ actual = uniform_crop(video, size=20, spatial_idx=2)
+ self.assertTrue(actual.equal(video[:, :, 5:25, 20:]))
+
+ # For videos with height > width.
+ video = thwc_to_cthw(create_dummy_video_frames(20, 40, 30)).to(
+ dtype=torch.float32
+ )
+ # Top crop.
+ actual = uniform_crop(video, size=20, spatial_idx=0)
+ self.assertTrue(actual.equal(video[:, :, :20, 5:25]))
+ # Center crop.
+ actual = uniform_crop(video, size=20, spatial_idx=1)
+ self.assertTrue(actual.equal(video[:, :, 10:30, 5:25]))
+ # Bottom crop.
+ actual = uniform_crop(video, size=20, spatial_idx=2)
+ self.assertTrue(actual.equal(video[:, :, 20:, 5:25]))
+
+ def test_uniform_crop_transforms(self):
+ video = thwc_to_cthw(create_dummy_video_frames(10, 30, 40)).to(
+ dtype=torch.float32
+ )
+ test_clip = {"video": video, "aug_index": 1, "label": 0}
+
+ transform = UniformCropVideo(20)
+
+ actual = transform(test_clip)
+ c, t, h, w = actual["video"].shape
+ self.assertEqual(c, 3)
+ self.assertEqual(t, 10)
+ self.assertEqual(h, 20)
+ self.assertEqual(w, 20)
+ self.assertTrue(actual["video"].equal(video[:, :, 5:25, 10:30]))
+
+ # TODO: add a test case for short_side_scale in the next diff
+ # (a sanity check to make sure the interp is not changed)
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 00000000..01f867fa
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,217 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import contextlib
+import os
+import pathlib
+import tempfile
+
+import av
+import numpy as np
+import torch
+import torchvision.io as io
+import torchvision.transforms as transforms
+from pytorchvideo.data.dataset_manifest_utils import (
+ EncodedVideoInfo,
+ VideoFrameInfo,
+ VideoInfo,
+)
+from pytorchvideo.data.utils import thwc_to_cthw
+
+
+def create_dummy_video_frames(num_frames: int, height: int, width: int):
+ y, x = torch.meshgrid(torch.linspace(-2, 2, height), torch.linspace(-2, 2, width))
+ data = []
+ for i in range(num_frames):
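+ # Each frame is a Gaussian blob whose center drifts across the image over
+ # time, so frames are distinct but deterministic.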
+ xc = float(i) / num_frames
+ yc = 1 - float(i) / (2 * num_frames)
+ d = torch.exp(-((x - xc) ** 2 + (y - yc) ** 2) / 2) * 255
+ data.append(d.unsqueeze(2).repeat(1, 1, 3).byte())
+ return torch.stack(data, 0)
+
+
+@contextlib.contextmanager
+def temp_encoded_video(num_frames: int, fps: int, height=10, width=10, prefix=None):
+ """
+ Creates a temporary lossless mp4 video with synthetic content. Uses a context
+ manager that deletes the video after exit.
+ """
+ # Lossless options.
+ video_codec = "libx264rgb"
+ options = {"crf": "0"}
+ data = create_dummy_video_frames(num_frames, height, width)
+ with tempfile.NamedTemporaryFile(prefix=prefix, suffix=".mp4") as f:
+ f.close()
+ io.write_video(f.name, data, fps=fps, video_codec=video_codec, options=options)
+ yield f.name, thwc_to_cthw(data).to(torch.float32)
+ os.unlink(f.name)
+
+
+@contextlib.contextmanager
+def temp_encoded_video_with_audio(
+ num_frames: int,
+ fps: int,
+ num_audio_samples: int,
+ audio_rate: int = 48000,
+ height=10,
+ width=10,
+ prefix=None,
+):
+ audio_data = torch.from_numpy(np.random.rand(1, num_audio_samples).astype("
+ ` (for simple layers) and `pytorchvideo/models/accelerator/` (for complex modules such as residual block). Inference of a model built up with the corresponding efficient blocks is guaranteed to be efficient on the target device.\n",
+ "\n",
+ "Each efficient block module is an instance of nn.Module, and has two forms: **original form** (for training) and **deploy form** (for inference). When in original form, the efficient block module has exactly the same behavior as a corresponding vanilla nn.Module for both forward and backward operation. User can freely mix and match efficient blocks for the same target device and build up their own model. Once model is built and trained, user can convert each efficient block in model into deploy form. The conversion will do graph and kernel optimization on each efficient block, and efficient block in deploy form is arithmetically equivalent to original form but has much higher efficiency during inference. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "zrK_kLiClgMB"
+ },
+ "source": [
+ "## Design, train and deploy a model composed of efficient blocks for mobile CPU\n",
+ "### Build a model\n",
+ "In this section, let's go through the process of design, train and deploy using a example toy model using efficient blocks under `pytorchvideo/layers/accelerator/mobile_cpu` and `pytorchvideo/models/accelerator/mobile_cpu`, which includes:\n",
+ "- One conv3d head layer with 5x1x1 kernel followed by ReLU activation;\n",
+ "- One residual block with squeeze-excite;\n",
+ "- One average pool and fully connected layer as final output.\n",
+ "\n",
+ "First, let's import efficient blocks."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "0jg4cZI5lgMC"
+ },
+ "outputs": [],
+ "source": [
+ "# Imports\n",
+ "import torch.nn as nn\n",
+ "from pytorchvideo.layers.accelerator.mobile_cpu.activation_functions import (\n",
+ " supported_act_functions,\n",
+ ")\n",
+ "from pytorchvideo.layers.accelerator.mobile_cpu.convolutions import (\n",
+ " Conv3d5x1x1BnAct,\n",
+ ")\n",
+ "from pytorchvideo.models.accelerator.mobile_cpu.residual_blocks import (\n",
+ " X3dBottleneckBlock,\n",
+ ")\n",
+ "from pytorchvideo.layers.accelerator.mobile_cpu.pool import AdaptiveAvgPool3dOutSize1\n",
+ "from pytorchvideo.layers.accelerator.mobile_cpu.fully_connected import FullyConnected\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "MxKCY8TzlgMC"
+ },
+ "source": [
+ "Then we can build a model using those efficient blocks."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "FYNnTanxlgMD"
+ },
+ "outputs": [],
+ "source": [
+ "class MyNet(nn.Module):\n",
+ " def __init__(\n",
+ " self,\n",
+ " in_channel=3, # input channel of first 5x1x1 layer\n",
+ " residual_block_channel=24, # input channel of residual block\n",
+ " expansion_ratio=3, # expansion ratio of residual block\n",
+ " num_classes=4, # final output classes\n",
+ " ):\n",
+ " super().__init__()\n",
+ " # s1 - 5x1x1 conv3d layer\n",
+ " self.s1 = Conv3d5x1x1BnAct(\n",
+ " in_channel,\n",
+ " residual_block_channel,\n",
+ " bias=False,\n",
+ " groups=1,\n",
+ " use_bn=False,\n",
+ " )\n",
+ " # s2 - residual block\n",
+ " mid_channel = int(residual_block_channel * expansion_ratio)\n",
+ " self.s2 = X3dBottleneckBlock(\n",
+ " in_channels=residual_block_channel,\n",
+ " mid_channels=mid_channel,\n",
+ " out_channels=residual_block_channel,\n",
+ " use_residual=True,\n",
+ " spatial_stride=1,\n",
+ " se_ratio=0.0625,\n",
+ " act_functions=(\"relu\", \"swish\", \"relu\"),\n",
+ " use_bn=(True, True, True),\n",
+ " )\n",
+ " # Average pool and fully connected layer\n",
+ " self.avg_pool = AdaptiveAvgPool3dOutSize1()\n",
+ " self.projection = FullyConnected(residual_block_channel, num_classes, bias=True)\n",
+ " self.act = supported_act_functions['relu']()\n",
+ "\n",
+ " def forward(self, x):\n",
+ " x = self.s1(x)\n",
+ " x = self.s2(x)\n",
+ " x = self.avg_pool(x)\n",
+ " # (N, C, T, H, W) -> (N, T, H, W, C).\n",
+ " x = x.permute((0, 2, 3, 4, 1))\n",
+ " x = self.projection(x)\n",
+ " # Performs fully convlutional inference.\n",
+ " if not self.training:\n",
+ " x = self.act(x)\n",
+ " x = x.mean([1, 2, 3])\n",
+ " x = x.view(x.shape[0], -1)\n",
+ "\n",
+ " return x"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "fB-_UEHilgMD"
+ },
+ "source": [
+ "We can instantiate MyNet and its efficient blocks will be in original form."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "FvXjdqT1lgMD"
+ },
+ "outputs": [],
+ "source": [
+ "net_inst = MyNet()\n",
+ "print(net_inst)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-O6jd3umlgMF"
+ },
+ "source": [
+ "### Train model\n",
+ "Then we can train the model with your dataset/optimizer. Here we skip this training step, and just leave the weight as initial value."
+ ]
+ },
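+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a rough illustration of the step we skip, the next cell sketches a minimal training loop on random tensors. The dummy input/label shapes, the SGD/cross-entropy choices and the final `flatten` are assumptions for this sketch, not part of the original tutorial; replace them with your own dataloader, loss and optimizer.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "\n",
+ "# Hypothetical optimizer/loss; replace with your own training setup.\n",
+ "optimizer = torch.optim.SGD(net_inst.parameters(), lr=0.01)\n",
+ "criterion = torch.nn.CrossEntropyLoss()\n",
+ "\n",
+ "net_inst.train()\n",
+ "for _ in range(2):  # a couple of dummy steps only\n",
+ "    # Random clip of shape (N, C, T, H, W) and random labels for the 4 classes.\n",
+ "    dummy_input = torch.randn(2, 3, 4, 6, 6)\n",
+ "    dummy_label = torch.randint(0, 4, (2,))\n",
+ "    optimizer.zero_grad()\n",
+ "    # In training mode MyNet returns (N, 1, 1, 1, num_classes); flatten to (N, num_classes).\n",
+ "    logits = net_inst(dummy_input).flatten(1)\n",
+ "    loss = criterion(logits, dummy_label)\n",
+ "    loss.backward()\n",
+ "    optimizer.step()"
+ ]
+ },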
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "RdNV2EeMlgMF"
+ },
+ "source": [
+ "### Deploy model\n",
+ "Now the model is ready to deploy. First of all, let's convert the model into deploy form. In order to do that, we need to use `convert_to_deployable_form` utility and provide an example input tensor to the model. Note that once the model is converted into deploy form, the input size should be the same as the example input tensor size during conversion."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "hA5ER4bLlgMF"
+ },
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from pytorchvideo.accelerator.deployment.mobile_cpu.utils.model_conversion import (\n",
+ " convert_to_deployable_form,\n",
+ ")\n",
+ "input_blob_size = (1, 3, 4, 6, 6)\n",
+ "input_tensor = torch.randn(input_blob_size)\n",
+ "net_inst_deploy = convert_to_deployable_form(net_inst, input_tensor)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6FC6knxWlgMG"
+ },
+ "source": [
+ "We can see that the network graph has been changed after conversion, which did kernel and graph optimization."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "WKXr2Pi1lgMG"
+ },
+ "outputs": [],
+ "source": [
+ "print(net_inst_deploy)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "BlA-TZivlgMG"
+ },
+ "source": [
+ "Let's check whether the network after conversion is arithmetically equivalent. We expect the output to be very close before/after conversion, with some small difference due to numeric noise from floating point operation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "I8lsM5oulgMG"
+ },
+ "outputs": [],
+ "source": [
+ "net_inst.eval()\n",
+ "out_ref = net_inst(input_tensor)\n",
+ "out = net_inst_deploy(input_tensor)\n",
+ "\n",
+ "max_err = float(torch.max(torch.abs(out_ref - out)))\n",
+ "print(f\"max error is {max_err}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Yq4c5HeYlgMH"
+ },
+ "source": [
+ "Next we have two options: either deploy floating point model, or quantize model into int8 and then deploy.\n",
+ "\n",
+ "Let's first assume we want to deploy floating point model. In this case, all we need to do is to export jit trace and then apply `optimize_for_mobile` for final optimization."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ZPX9InColgMH"
+ },
+ "outputs": [],
+ "source": [
+ "from torch.utils.mobile_optimizer import (\n",
+ " optimize_for_mobile,\n",
+ ")\n",
+ "traced_model = torch.jit.trace(net_inst_deploy, input_tensor, strict=False)\n",
+ "traced_model_opt = optimize_for_mobile(traced_model)\n",
+ "# Here we can save the traced_model_opt to JIT file using traced_model_opt.save()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6jFmLo-algMI"
+ },
+ "source": [
+ "Alternatively, we may also want to deploy a quantized model. Efficient blocks are quantization-friendly by design - just wrap the model in deploy form with `QuantStub/DeQuantStub` and it is ready for Pytorch eager mode quantization."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "syb-6y2glgMI"
+ },
+ "outputs": [],
+ "source": [
+ "# Wrapper class for adding QuantStub/DeQuantStub.\n",
+ "class quant_stub_wrapper(nn.Module):\n",
+ " def __init__(self, module_in):\n",
+ " super().__init__()\n",
+ " self.quant = torch.quantization.QuantStub()\n",
+ " self.model = module_in\n",
+ " self.dequant = torch.quantization.DeQuantStub()\n",
+ " def forward(self, x):\n",
+ " x = self.quant(x)\n",
+ " x = self.model(x)\n",
+ " x = self.dequant(x)\n",
+ " return x"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "yUrtbvo_lgMI"
+ },
+ "outputs": [],
+ "source": [
+ "net_inst_quant_stub_wrapper = quant_stub_wrapper(net_inst_deploy)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "qEX2FdcIlgMI"
+ },
+ "source": [
+ "Preparation step of quantization. Fusion has been done for efficient blocks automatically during `convert_to_deployable_form`, so we can just proceed to `torch.quantization.prepare`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "r6DfTh1ElgMI"
+ },
+ "outputs": [],
+ "source": [
+ "net_inst_quant_stub_wrapper.qconfig = torch.quantization.default_qconfig\n",
+ "net_inst_quant_stub_wrapper_prepared = torch.quantization.prepare(net_inst_quant_stub_wrapper)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "q-SkDlVflgMJ"
+ },
+ "source": [
+ "Calibration and quantization. After preparation we will do calibration of quantization by feeding calibration dataset (skipped here) and then do quantization."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "E9Zh45yalgMJ"
+ },
+ "outputs": [],
+ "source": [
+ "# calibration is skipped here.\n",
+ "net_inst_quant_stub_wrapper_quantized = torch.quantization.convert(net_inst_quant_stub_wrapper_prepared)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "n1j11-5KlgMJ"
+ },
+ "outputs": [],
+ "source": [
+ "print(net_inst_quant_stub_wrapper_quantized)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "7KjWPclrlgMJ"
+ },
+ "source": [
+ "Then we can export trace of int8 model and deploy on mobile devices."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "D5YXI4kvlgMK"
+ },
+ "outputs": [],
+ "source": [
+ "traced_model_int8 = torch.jit.trace(net_inst_quant_stub_wrapper_quantized, input_tensor, strict=False)\n",
+ "traced_model_int8_opt = optimize_for_mobile(traced_model_int8)\n",
+ "# Here we can save the traced_model_opt to JIT file using traced_model_int8_opt.save()"
+ ]
+ }
+ ],
+ "metadata": {
+ "bento_stylesheets": {
+ "bento/extensions/flow/main.css": true,
+ "bento/extensions/kernel_selector/main.css": true,
+ "bento/extensions/kernel_ui/main.css": true,
+ "bento/extensions/new_kernel/main.css": true,
+ "bento/extensions/system_usage/main.css": true,
+ "bento/extensions/theme/main.css": true
+ },
+ "colab": {
+ "collapsed_sections": [],
+ "name": "Build your model with PytorchVideo Accelerator.ipynb",
+ "provenance": []
+ },
+ "disseminate_notebook_id": {
+ "notebook_id": "709466976415887"
+ },
+ "disseminate_notebook_info": {
+ "bento_version": "20210314-210430",
+ "description": "PTV tutorial",
+ "hide_code": false,
+ "hipster_group": "",
+ "kernel_build_info": {
+ "error": ""
+ },
+ "no_uii": true,
+ "notebook_number": "512478",
+ "others_can_edit": false,
+ "reviewers": "",
+ "revision_id": "482523946213747",
+ "tags": "",
+ "tasks": "",
+ "title": "Build your model with PytorchVideo Accelerator"
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/tutorials/accelerator/Use_Model_Transmuter.ipynb b/tutorials/accelerator/Use_Model_Transmuter.ipynb
new file mode 100644
index 00000000..bdb21095
--- /dev/null
+++ b/tutorials/accelerator/Use_Model_Transmuter.ipynb
@@ -0,0 +1,279 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "yOVmvnwW6ism"
+ },
+ "source": [
+ "## Introduction\n",
+ "Got your own model, but still want to fully leverage efficient blocks in PytorchVideo/Accelerator? No problem, model transmuter can help you.\n",
+ "Model transmuter is a utility in PytorchVideo/Accelerator that takes user defined model, and replace modules in user model with equivalent efficient block when possible.\n",
+ "In this tutorial, we will go through typical steps of using model transmuter, including:\n",
+ "- Use model transmuter to replace modules in user model with efficient blocks\n",
+ "- Convert model into deploy form and deploy\n",
+ "\n",
+ "Before we start, let's install PytorchVideo."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2_v3ehr3Bt1T"
+ },
+ "outputs": [],
+ "source": [
+ "!pip install pytorchvideo"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "1-RsOLo46iss"
+ },
+ "source": [
+ "## Use model transmuter to replace modules in user model with efficient blocks\n",
+ "First, let's assume user has following model to be transmuted:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ST7sgFdM6ist"
+ },
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "import torch.nn as nn\n",
+ "\n",
+ "class user_model_residual_block(nn.Module):\n",
+ " def __init__(self):\n",
+ " super().__init__()\n",
+ " self.stem0 = nn.Conv3d(3, 3, kernel_size=(3, 1, 1), padding=(1, 0, 0))\n",
+ " self.stem1 = nn.Conv3d(3, 3, kernel_size=(5, 1, 1), padding=(2, 0, 0))\n",
+ " self.pw = nn.Conv3d(3, 6, kernel_size=1)\n",
+ " self.relu = nn.ReLU()\n",
+ " self.dw = nn.Conv3d(6, 6, kernel_size=3, padding=1, groups=6)\n",
+ " self.relu1 = nn.ReLU()\n",
+ " self.pwl = nn.Conv3d(6, 3, kernel_size=1)\n",
+ " self.relu2 = nn.ReLU()\n",
+ "\n",
+ " def forward(self, x):\n",
+ " out = self.stem0(x)\n",
+ " out = self.stem1(out)\n",
+ " out = self.pw(out)\n",
+ " out = self.relu(out)\n",
+ " out = self.dw(out)\n",
+ " out = self.relu1(out)\n",
+ " out = self.pwl(out)\n",
+ " return self.relu2(out + x)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "f6vbMoE46ist"
+ },
+ "source": [
+ "Then, let's use model transmuter by importing transmuter for targeting device. In this tutorial, we are using mobile cpu as example. Therefore we will import (1) model transmuter for mobile cpu and (2) top-level wrapper of model transmuter."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "zi8KsCSh6isu"
+ },
+ "outputs": [],
+ "source": [
+ "import pytorchvideo.accelerator.deployment.mobile_cpu.transmuter # mobile cpu model transmuter\n",
+ "from pytorchvideo.accelerator.deployment.common.model_transmuter import transmute_model # top-level wrapper of model transmuter"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "t4meNp416isu"
+ },
+ "source": [
+ "We instantiate one user_model_residual_block, and transmute it by calling `transmute_model` with argument of `target_device=\"mobile_cpu\"`. We can see that the some of modules in model has been replaced by printing it again. In general, model transmuter will replace one submodule if its equivalent efficient block is found, otherwise that submodule will be kept intact."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "N-YzZp_d6isu"
+ },
+ "outputs": [],
+ "source": [
+ "model_transmute = user_model_residual_block()\n",
+ "print(\"original model\")\n",
+ "print(model_transmute)\n",
+ "transmute_model(\n",
+ " model_transmute,\n",
+ " target_device=\"mobile_cpu\",\n",
+ ")\n",
+ "print(\"after transmute\")\n",
+ "print(model_transmute)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "74G3zWYF6isv"
+ },
+ "source": [
+ "## Convert model into deploy form and deploy\n",
+ "Now the model is ready to deploy. First of all, let's convert the model into deploy form. In order to do that, we need to use `convert_to_deployable_form` utility and provide an example input tensor to the model. `convert_to_deployable_form` will convert any instance of `EfficientBlockBase` (base class for efficient blocks in PytorchVideo/Accelerator) into deploy form, while leave other modules unchanged.\n",
+ "Note that once the model is converted into deploy form, the input size should be the same as the example input tensor size during conversion."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "NCeIb59m6isw"
+ },
+ "outputs": [],
+ "source": [
+ "# Define example input tensor\n",
+ "input_blob_size = (1, 3, 4, 6, 6)\n",
+ "input_tensor = torch.randn(input_blob_size)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "3y3GBWdF6isw"
+ },
+ "outputs": [],
+ "source": [
+ "from pytorchvideo.accelerator.deployment.mobile_cpu.utils.model_conversion import (\n",
+ " convert_to_deployable_form,\n",
+ ")\n",
+ "model_transmute_deploy = convert_to_deployable_form(\n",
+ " model_transmute, input_tensor\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "HLt0515O6isw"
+ },
+ "source": [
+ "We can observe further kernel graph change after conversion into deploy mode."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "7cd1NCew6isw"
+ },
+ "outputs": [],
+ "source": [
+ "print(model_transmute_deploy)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jCRJquGw6isx"
+ },
+ "source": [
+ "Currently model transmuter only supports fp32 operation, and it will support int8 with incoming torch.fx quantization mode. In this tutorial, we assume deploy transmuted model without quantization. In this case, all we need to do is to export jit trace and then apply `optimize_for_mobile` for final optimization."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "i2Mr_Il26isx"
+ },
+ "outputs": [],
+ "source": [
+ "from torch.utils.mobile_optimizer import (\n",
+ " optimize_for_mobile,\n",
+ ")\n",
+ "traced_model = torch.jit.trace(model_transmute_deploy, input_tensor, strict=False)\n",
+ "traced_model_opt = optimize_for_mobile(traced_model)\n",
+ "# Here we can save the traced_model_opt to JIT file using traced_model_opt.save()"
+ ]
+ }
+ ],
+ "metadata": {
+ "bento_stylesheets": {
+ "bento/extensions/flow/main.css": true,
+ "bento/extensions/kernel_selector/main.css": true,
+ "bento/extensions/kernel_ui/main.css": true,
+ "bento/extensions/new_kernel/main.css": true,
+ "bento/extensions/system_usage/main.css": true,
+ "bento/extensions/theme/main.css": true
+ },
+ "colab": {
+ "collapsed_sections": [],
+ "name": "Use Model Transmuter.ipynb",
+ "provenance": []
+ },
+ "disseminate_notebook_id": {
+ "notebook_id": "2903671383210410"
+ },
+ "disseminate_notebook_info": {
+ "bento_version": "20210321-210352",
+ "description": "",
+ "hide_code": false,
+ "hipster_group": "",
+ "kernel_build_info": {
+ "error": ""
+ },
+ "no_uii": true,
+ "notebook_number": "520938",
+ "others_can_edit": false,
+ "reviewers": "",
+ "revision_id": "464970858270301",
+ "tags": "",
+ "tasks": "",
+ "title": "Use Model Transmuter"
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/tutorials/accelerator/Use_PytorchVideo_Accelerator_Model_Zoo.ipynb b/tutorials/accelerator/Use_PytorchVideo_Accelerator_Model_Zoo.ipynb
new file mode 100644
index 00000000..89b2a740
--- /dev/null
+++ b/tutorials/accelerator/Use_PytorchVideo_Accelerator_Model_Zoo.ipynb
@@ -0,0 +1,345 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "PV1MwvbCm8X1"
+ },
+ "source": [
+ "## Introduction\n",
+ "This tutorial goes through how to use model zoo provided by PytorchVideo/Accelerator. To use model zoo in PytorchVideo/Accelerator, we should generally follow several steps:\n",
+ "- Use model builder to build selected model; \n",
+ "- Load pretrain checkpoint;\n",
+ "- (Optional) Finetune;\n",
+ "- Deploy.\n",
+ "\n",
+ "Before we start, let's install PytorchVideo."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "h21XJwAKnB8q"
+ },
+ "outputs": [],
+ "source": [
+ "!pip install pytorchvideo"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "kppASAd8m8X4"
+ },
+ "source": [
+ "## Use model builder to build selected model\n",
+ "We use model builder in PytorchVideo/Accelerator model zoo to build pre-defined efficient model. Here we use EfficientX3D-XS (for mobile_cpu) as an example. For more available models and details, please refer to [this page].\n",
+ "\n",
+ "EfficientX3D-XS is an implementation of X3D-XS network as described in [X3D paper](https://arxiv.org/abs/2004.04730) using efficient blocks. It is arithmetically equivalent with X3D-XS, but our benchmark on mobile phone shows 4.6X latency reduction compared with vanilla implementation.\n",
+ "\n",
+ "In order to build EfficientX3D-XS, we simply do the following:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "VwxiWAbQm8X5"
+ },
+ "outputs": [],
+ "source": [
+ "from pytorchvideo.models.accelerator.mobile_cpu.efficient_x3d import EfficientX3d\n",
+ "model_efficient_x3d_xs = EfficientX3d(expansion='XS', head_act='identity')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "uuRnwhYzm8X5"
+ },
+ "source": [
+ "Note that now the efficient blocks in the model are in original form, so the model is good for further training."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "RSYnB3p8m8X5"
+ },
+ "source": [
+ "## Load pretrain checkpoint and (optional) finetune\n",
+ "For each model in model zoo, we provide pretrain checkpoint state_dict for model in original form. See [this page] for details about checkpoints and where to download them."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "X9toVl9xm8X6"
+ },
+ "outputs": [],
+ "source": [
+ "from torch.hub import load_state_dict_from_url\n",
+ "checkpoint_path = 'https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/efficient_x3d_xs_original_form.pyth'\n",
+ "checkpoint = load_state_dict_from_url(checkpoint_path)\n",
+ "\n",
+ "model_efficient_x3d_xs.load_state_dict(checkpoint)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "cwPUPjJom8X6"
+ },
+ "source": [
+ "Now the model is ready for fine-tune. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jcD6nyVzm8X6"
+ },
+ "source": [
+ "## Deploy\n",
+ "Now the model is ready to deploy. First of all, let's convert the model into deploy form. In order to do that, we need to use `convert_to_deployable_form` utility and provide an example input tensor to the model. Note that once the model is converted into deploy form, the input size should be the same as the example input tensor size during conversion."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2SAavQBZm8X7"
+ },
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from pytorchvideo.accelerator.deployment.mobile_cpu.utils.model_conversion import (\n",
+ " convert_to_deployable_form,\n",
+ ")\n",
+ "input_blob_size = (1, 3, 4, 160, 160)\n",
+ "input_tensor = torch.randn(input_blob_size)\n",
+ "model_efficient_x3d_xs_deploy = convert_to_deployable_form(model_efficient_x3d_xs, input_tensor)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ToAwX-2Jm8X7"
+ },
+ "source": [
+ "We can see that the network graph has been changed after conversion, which did kernel and graph optimization."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true,
+ "id": "EWMrKRpim8X7"
+ },
+ "outputs": [],
+ "source": [
+ "print(model_efficient_x3d_xs_deploy)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "3HfFgDgCm8X8"
+ },
+ "source": [
+ "Next we have two options: either deploy floating point model, or quantize model into int8 and then deploy.\n",
+ "\n",
+ "Let's first assume we want to deploy floating point model. In this case, all we need to do is to export jit trace and then apply `optimize_for_mobile` for final optimization."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "966SbScHm8X9"
+ },
+ "outputs": [],
+ "source": [
+ "from torch.utils.mobile_optimizer import (\n",
+ " optimize_for_mobile,\n",
+ ")\n",
+ "traced_model = torch.jit.trace(model_efficient_x3d_xs_deploy, input_tensor, strict=False)\n",
+ "traced_model_opt = optimize_for_mobile(traced_model)\n",
+ "# Here we can save the traced_model_opt to JIT file using traced_model_opt.save()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Yjaeep9Wm8X9"
+ },
+ "source": [
+ "Alternatively, we may also want to deploy a quantized model. Efficient blocks are quantization-friendly by design - just wrap the model in deploy form with `QuantStub/DeQuantStub` and it is ready for Pytorch eager mode quantization."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "-cD-OL4km8X9"
+ },
+ "outputs": [],
+ "source": [
+ "import torch.nn as nn\n",
+ "# Wrapper class for adding QuantStub/DeQuantStub.\n",
+ "class quant_stub_wrapper(nn.Module):\n",
+ " def __init__(self, module_in):\n",
+ " super().__init__()\n",
+ " self.quant = torch.quantization.QuantStub()\n",
+ " self.model = module_in\n",
+ " self.dequant = torch.quantization.DeQuantStub()\n",
+ " def forward(self, x):\n",
+ " x = self.quant(x)\n",
+ " x = self.model(x)\n",
+ " x = self.dequant(x)\n",
+ " return x"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "b_-0Kyeym8X-"
+ },
+ "outputs": [],
+ "source": [
+ "model_efficient_x3d_xs_deploy_quant_stub_wrapper = quant_stub_wrapper(model_efficient_x3d_xs_deploy)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "S_rv-Gxcm8YK"
+ },
+ "source": [
+ "Preparation step of quantization. Fusion has been done for efficient blocks automatically during `convert_to_deployable_form`, so we can just proceed to `torch.quantization.prepare`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "-kLtF7tpm8YL"
+ },
+ "outputs": [],
+ "source": [
+ "model_efficient_x3d_xs_deploy_quant_stub_wrapper.qconfig = torch.quantization.default_qconfig\n",
+ "model_efficient_x3d_xs_deploy_quant_stub_wrapper_prepared = torch.quantization.prepare(model_efficient_x3d_xs_deploy_quant_stub_wrapper)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2W10VcNwm8YM"
+ },
+ "source": [
+ "Calibration and quantization. After preparation we will do calibration of quantization by feeding calibration dataset (skipped here) and then do quantization."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "zR2MrKv-m8YM"
+ },
+ "outputs": [],
+ "source": [
+ "# calibration is skipped here.\n",
+ "model_efficient_x3d_xs_deploy_quant_stub_wrapper_quantized = torch.quantization.convert(model_efficient_x3d_xs_deploy_quant_stub_wrapper_prepared)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "87eImwZCm8YM"
+ },
+ "source": [
+ "Then we can export trace of int8 model and deploy on mobile devices."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "kbN27xw_m8YM"
+ },
+ "outputs": [],
+ "source": [
+ "traced_model_int8 = torch.jit.trace(model_efficient_x3d_xs_deploy_quant_stub_wrapper_quantized, input_tensor, strict=False)\n",
+ "traced_model_int8_opt = optimize_for_mobile(traced_model_int8)\n",
+ "# Here we can save the traced_model_opt to JIT file using traced_model_int8_opt.save()"
+ ]
+ }
+ ],
+ "metadata": {
+ "bento_stylesheets": {
+ "bento/extensions/flow/main.css": true,
+ "bento/extensions/kernel_selector/main.css": true,
+ "bento/extensions/kernel_ui/main.css": true,
+ "bento/extensions/new_kernel/main.css": true,
+ "bento/extensions/system_usage/main.css": true,
+ "bento/extensions/theme/main.css": true
+ },
+ "colab": {
+ "collapsed_sections": [],
+ "name": "Use PytorchVideo Accelerator Model Zoo.ipynb",
+ "provenance": []
+ },
+ "disseminate_notebook_id": {
+ "notebook_id": "478609506614914"
+ },
+ "disseminate_notebook_info": {
+ "bento_version": "20210314-210430",
+ "description": "",
+ "hide_code": false,
+ "hipster_group": "",
+ "kernel_build_info": {
+ "error": ""
+ },
+ "no_uii": true,
+ "notebook_number": "514048",
+ "others_can_edit": false,
+ "reviewers": "",
+ "revision_id": "466653834533727",
+ "tags": "",
+ "tasks": "",
+ "title": "Using PytorchVideo Accelerator Model Zoo"
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/tutorials/torchhub_inference_tutorial.ipynb b/tutorials/torchhub_inference_tutorial.ipynb
new file mode 100644
index 00000000..1a27daaa
--- /dev/null
+++ b/tutorials/torchhub_inference_tutorial.ipynb
@@ -0,0 +1,264 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Torch Hub Inference Tutorial\n",
+ "\n",
+ "In this tutorial you'll learn:\n",
+ "- how to load a pretrained model using Torch Hub \n",
+ "- run inference to classify the action in a demo video\n",
+ "\n",
+ "\n",
+ "NOTE: Currently this tutorial will only work with a local clone of the PyTorchVideo GitHub repo. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Import modules"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json \n",
+ "import torch\n",
+ "from torchvision.transforms import Compose, Lambda\n",
+ "from torchvision.transforms._transforms_video import (\n",
+ " CenterCropVideo,\n",
+ " NormalizeVideo,\n",
+ ")\n",
+ "from pytorchvideo.data.encoded_video import EncodedVideo\n",
+ "from pytorchvideo.transforms import (\n",
+ " ApplyTransformToKey,\n",
+ " ShortSideScale,\n",
+ " UniformTemporalSubsample\n",
+ ") "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Setup \n",
+ "\n",
+ "Download the id to label mapping for the Kinetics 400 dataset on which the torch hub models were trained. \n",
+ "This will be used to get the category label names from the predicted class ids."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!wget https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(\"kinetics_classnames.json\", \"r\") as f:\n",
+ " kinetics_classnames = json.load(f)\n",
+ "\n",
+ "# Create an id to label name mapping\n",
+ "kinetics_id_to_classname = {}\n",
+ "for k, v in kinetics_classnames.items():\n",
+ " kinetics_id_to_classname[v] = str(k).replace('\"', \"\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Load Model using Torch Hub API\n",
+ "\n",
+ "PyTorchVideo provides several pretrained models through Torch Hub. Available models are described in [model zoo documentation](https://github.com/facebookresearch/pytorchvideo/blob/master/docs/source/model_zoo.md#kinetics-400). \n",
+ "\n",
+ "Here we are selecting the `slow_r50` model which was trained using a 8x8 setting on the Kinetics 400 dataset. \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Device on which to run the model\n",
+ "device = \"cuda:0\"\n",
+ "\n",
+ "# Pick a pretrained model \n",
+ "model_name = \"slow_r50\"\n",
+ "\n",
+ "# Local path to the parent folder of hubconf.py in the pytorchvideo codebase\n",
+ "path = '../' \n",
+ "model = torch.hub.load(path, source=\"local\", model=model_name, pretrained=True)\n",
+ "\n",
+ "# Set to eval mode and move to desired device\n",
+ "model = model.eval()\n",
+ "model = model.to(device)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Define the transformations for the input required by the model\n",
+ "\n",
+ "Before passing the video into the model we need to apply some input transforms and sample a clip of the correct duration. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "side_size = 256\n",
+ "mean = [0.45, 0.45, 0.45]\n",
+ "std = [0.225, 0.225, 0.225]\n",
+ "crop_size = 256\n",
+ "num_frames = 8\n",
+ "sampling_rate = 8\n",
+ "frames_per_second = 30\n",
+ "\n",
+ "# Note that this transform is specific to the slow_R50 model. \n",
+ "# If you want to try another of the torch hub models you will need to modify this transform\n",
+ "transform = ApplyTransformToKey(\n",
+ " key=\"video\",\n",
+ " transform=Compose(\n",
+ " [\n",
+ " UniformTemporalSubsample(num_frames),\n",
+ " Lambda(lambda x: x/255.0),\n",
+ " NormalizeVideo(mean, std),\n",
+ " ShortSideScale(\n",
+ " size=side_size\n",
+ " ),\n",
+ " CenterCropVideo(crop_size=(crop_size, crop_size))\n",
+ " ]\n",
+ " ),\n",
+ ")\n",
+ "\n",
+ "# The duration of the input clip is also specific to the model.\n",
+ "clip_duration = (num_frames * sampling_rate)/frames_per_second"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Load an example video\n",
+ "We can test the classification of an example video from the kinetics validation set such as this [archery video](https://www.youtube.com/watch?v=3and4vWkW4s)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Download the example video file\n",
+ "!wget https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4 "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the example video\n",
+ "video_path = \"archery.mp4\" \n",
+ "\n",
+ "# Select the duration of the clip to load by specifying the start and end duration\n",
+ "# The start_sec should correspond to where the action occurs in the video\n",
+ "start_sec = 0\n",
+ "end_sec = start_sec + clip_duration \n",
+ "\n",
+ "# Initialize an EncodedVideo helper class\n",
+ "video = EncodedVideo.from_path(video_path)\n",
+ "\n",
+ "# Load the desired clip\n",
+ "video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)\n",
+ "\n",
+ "# Apply a transform to normalize the video input\n",
+ "video_data = transform(video_data)\n",
+ "\n",
+ "# Move the inputs to the desired device\n",
+ "inputs = video_data[\"video\"]\n",
+ "inputs = inputs.to(device)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Get model predictions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pass the input clip through the model \n",
+ "preds = model(inputs[None, ...])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get the predicted classes \n",
+ "post_act = torch.nn.Softmax(dim=1)\n",
+ "preds = post_act(preds)\n",
+ "pred_classes = preds.topk(k=5).indices\n",
+ "\n",
+ "# Map the predicted classes to the label names\n",
+ "pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes[0]]\n",
+ "print(\"Predicted labels: %s\" % \", \".join(pred_class_names))"
+ ]
+ }
+ ],
+ "metadata": {
+ "bento_stylesheets": {
+ "bento/extensions/flow/main.css": true,
+ "bento/extensions/kernel_selector/main.css": true,
+ "bento/extensions/kernel_ui/main.css": true,
+ "bento/extensions/new_kernel/main.css": true,
+ "bento/extensions/system_usage/main.css": true,
+ "bento/extensions/theme/main.css": true
+ },
+ "kernelspec": {
+ "display_name": "pytorchvideo_etc (local)",
+ "language": "python",
+ "name": "pytorchvideo_etc_local"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/website/.dockerignore b/website/.dockerignore
new file mode 100644
index 00000000..27d2dae2
--- /dev/null
+++ b/website/.dockerignore
@@ -0,0 +1,2 @@
+*/node_modules
+*.log
diff --git a/website/.gitignore b/website/.gitignore
new file mode 100644
index 00000000..5395ea79
--- /dev/null
+++ b/website/.gitignore
@@ -0,0 +1,12 @@
+.DS_Store
+
+node_modules
+
+lib/core/metadata.js
+lib/core/MetadataBlog.js
+
+website/translated_docs
+website/build/
+website/yarn.lock
+website/node_modules
+website/i18n/*
diff --git a/website/docs/tutorial_accelerator_build_your_model.md b/website/docs/tutorial_accelerator_build_your_model.md
new file mode 100644
index 00000000..aa28c342
--- /dev/null
+++ b/website/docs/tutorial_accelerator_build_your_model.md
@@ -0,0 +1,439 @@
+---
+id: tutorial_accelerator_build_your_model
+title: Build your efficient model with PytorchVideo/Accelerator
+---
+
+
+## Introduction
+In this tutorial, we will go through:
+- Basics of efficient blocks in PytorchVideo/Accelerator;
+- Design, train and deploy a model composed of efficient blocks for mobile CPU.
+
+## Basics of efficient blocks in PytorchVideo/Accelerator
+Efficient blocks are building blocks with high inference efficiency. For a target device, we benchmark the efficiency of basic network components and provide a collection of efficient blocks under `pytorchvideo/layers/accelerator/` (for simple layers) and `pytorchvideo/models/accelerator/` (for complex modules such as residual blocks). Inference with a model built from the corresponding efficient blocks on the target device is guaranteed to be efficient.
+
+Each efficient block module is an instance of nn.Module, and has two forms: **original form** (for training) and **deploy form** (for inference). In original form, an efficient block module has exactly the same behavior as the corresponding vanilla nn.Module for both forward and backward operation. Users can freely mix and match efficient blocks for the same target device and build up their own model. Once the model is built and trained, each efficient block in the model can be converted into deploy form. The conversion applies graph and kernel optimization to each efficient block; an efficient block in deploy form is arithmetically equivalent to its original form but has much higher efficiency during inference.
+
+## Design, train and deploy a model composed of efficient blocks for mobile CPU
+### Build a model
+In this section, let's go through the process of designing, training and deploying an example toy model built with efficient blocks under `pytorchvideo/layers/accelerator/mobile_cpu` and `pytorchvideo/models/accelerator/mobile_cpu`, which includes:
+- One conv3d head layer with 5x1x1 kernel followed by ReLU activation;
+- One residual block with squeeze-excite;
+- One average pool and fully connected layer as final output.
+
+First, let's import efficient blocks.
+
+
+```python
+# Imports
+import torch.nn as nn
+from pytorchvideo.layers.accelerator.mobile_cpu.activation_functions import (
+ supported_act_functions,
+)
+from pytorchvideo.layers.accelerator.mobile_cpu.convolutions import (
+ Conv3d5x1x1BnAct,
+)
+from pytorchvideo.models.accelerator.mobile_cpu.residual_blocks import (
+ X3dBottleneckBlock,
+)
+from pytorchvideo.layers.accelerator.mobile_cpu.pool import AdaptiveAvgPool3dOutSize1
+from pytorchvideo.layers.accelerator.mobile_cpu.fully_connected import FullyConnected
+
+```
+
+Then we can build a model using those efficient blocks.
+
+
+```python
+class MyNet(nn.Module):
+ def __init__(
+ self,
+ in_channel=3, # input channel of first 5x1x1 layer
+ residual_block_channel=24, # input channel of residual block
+ expansion_ratio=3, # expansion ratio of residual block
+ num_classes=4, # final output classes
+ ):
+ super().__init__()
+ # s1 - 5x1x1 conv3d layer
+ self.s1 = Conv3d5x1x1BnAct(
+ in_channel,
+ residual_block_channel,
+ bias=False,
+ groups=1,
+ use_bn=False,
+ )
+ # s2 - residual block
+ mid_channel = int(residual_block_channel * expansion_ratio)
+ self.s2 = X3dBottleneckBlock(
+ in_channels=residual_block_channel,
+ mid_channels=mid_channel,
+ out_channels=residual_block_channel,
+ use_residual=True,
+ spatial_stride=1,
+ se_ratio=0.0625,
+ act_functions=("relu", "swish", "relu"),
+ use_bn=(True, True, True),
+ )
+ # Average pool and fully connected layer
+ self.avg_pool = AdaptiveAvgPool3dOutSize1()
+ self.projection = FullyConnected(residual_block_channel, num_classes, bias=True)
+ self.act = supported_act_functions['relu']()
+
+ def forward(self, x):
+ x = self.s1(x)
+ x = self.s2(x)
+ x = self.avg_pool(x)
+ # (N, C, T, H, W) -> (N, T, H, W, C).
+ x = x.permute((0, 2, 3, 4, 1))
+ x = self.projection(x)
+        # Performs fully convolutional inference.
+ if not self.training:
+ x = self.act(x)
+ x = x.mean([1, 2, 3])
+ x = x.view(x.shape[0], -1)
+
+ return x
+```
+
+We can instantiate MyNet and its efficient blocks will be in original form.
+
+
+```python
+net_inst = MyNet()
+print(net_inst)
+```
+
+ MyNet(
+ (s1): Conv3d5x1x1BnAct(
+ (kernel): Sequential(
+ (conv): Conv3d(3, 24, kernel_size=(5, 1, 1), stride=(1, 1, 1), padding=(2, 0, 0), bias=False)
+ (act): ReLU(
+ (act): ReLU(inplace=True)
+ )
+ )
+ )
+ (s2): X3dBottleneckBlock(
+ (_residual_add_func): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (final_act): ReLU(
+ (act): ReLU(inplace=True)
+ )
+ (layers): Sequential(
+ (conv_0): Conv3dPwBnAct(
+ (kernel): Sequential(
+ (conv): Conv3d(24, 72, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
+ (bn): BatchNorm3d(72, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+ (act): ReLU(
+ (act): ReLU(inplace=True)
+ )
+ )
+ )
+ (conv_1): Conv3d3x3x3DwBnAct(
+ (kernel): Sequential(
+ (conv): Conv3d(72, 72, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), groups=72, bias=False)
+ (bn): BatchNorm3d(72, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+ (act): Identity(
+ (act): Identity()
+ )
+ )
+ )
+ (se): SqueezeExcitation(
+ (se): SqueezeExcitation(
+ (block): Sequential(
+ (0): Conv3d(72, 8, kernel_size=(1, 1, 1), stride=(1, 1, 1))
+ (1): ReLU()
+ (2): Conv3d(8, 72, kernel_size=(1, 1, 1), stride=(1, 1, 1))
+ (3): Sigmoid()
+ )
+ )
+ )
+ (act_func_1): Swish(
+ (act): Swish()
+ )
+ (conv_2): Conv3dPwBnAct(
+ (kernel): Sequential(
+ (conv): Conv3d(72, 24, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
+ (bn): BatchNorm3d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+ (act): Identity(
+ (act): Identity()
+ )
+ )
+ )
+ )
+ )
+ (avg_pool): AdaptiveAvgPool3dOutSize1(
+ (pool): AdaptiveAvgPool3d(output_size=1)
+ )
+ (projection): FullyConnected(
+ (model): Linear(in_features=24, out_features=4, bias=True)
+ )
+ (act): ReLU(
+ (act): ReLU(inplace=True)
+ )
+ )
+
+
+### Train model
+Then we can train the model with our own dataset and optimizer. Here we skip the training step and just leave the weights at their initial values.
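+
+For illustration only, here is a minimal training-loop sketch. The random clips, labels and the SGD optimizer below are placeholders (assumptions, not part of the tutorial pipeline); in practice you would plug in your own dataloader and training recipe.
+
+
+```python
+import torch
+import torch.nn as nn
+
+# Hypothetical training sketch: random clips/labels stand in for a real dataset.
+optimizer = torch.optim.SGD(net_inst.parameters(), lr=0.01, momentum=0.9)
+criterion = nn.CrossEntropyLoss()
+
+net_inst.train()
+for step in range(10):  # replace with an epoch loop over your dataloader
+    clips = torch.randn(8, 3, 4, 6, 6)   # (N, C, T, H, W) dummy input clips
+    labels = torch.randint(0, 4, (8,))   # dummy labels for the 4 output classes
+    optimizer.zero_grad()
+    preds = net_inst(clips).reshape(clips.shape[0], -1)  # flatten to (N, num_classes)
+    loss = criterion(preds, labels)
+    loss.backward()
+    optimizer.step()
+```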
+
+### Deploy model
+Now the model is ready to deploy. First of all, let's convert the model into deploy form. In order to do that, we need to use `convert_to_deployable_form` utility and provide an example input tensor to the model. Note that once the model is converted into deploy form, the input size should be the same as the example input tensor size during conversion.
+
+
+```python
+import torch
+from pytorchvideo.accelerator.deployment.mobile_cpu.utils.model_conversion import (
+ convert_to_deployable_form,
+)
+input_blob_size = (1, 3, 4, 6, 6)
+input_tensor = torch.randn(input_blob_size)
+net_inst_deploy = convert_to_deployable_form(net_inst, input_tensor)
+
+```
+
+We can see that the network graph has changed after conversion, which applied kernel and graph optimizations.
+
+
+```python
+print(net_inst_deploy)
+```
+
+ MyNet(
+ (s1): Conv3d5x1x1BnAct(
+ (kernel): Sequential(
+ (conv): _Conv3dTemporalKernel5Decomposed(
+ (_conv2d_0): Conv2d(3, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
+ (_conv2d_1): Conv2d(3, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
+ (_conv2d_2): Conv2d(3, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
+ (_conv2d_3): Conv2d(3, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
+ (_conv2d_4): Conv2d(3, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
+ (_add_funcs): ModuleList(
+ (0): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (1): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (2): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (3): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (4): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (5): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (6): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (7): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (8): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (9): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ )
+ (_cat_func): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ )
+ (act): ReLU(
+ (act): ReLU(inplace=True)
+ )
+ )
+ )
+ (s2): X3dBottleneckBlock(
+ (_residual_add_func): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (final_act): ReLU(
+ (act): ReLU(inplace=True)
+ )
+ (layers): Sequential(
+ (conv_0): Conv3dPwBnAct(
+ (kernel): Sequential(
+ (0): _Reshape()
+ (1): Sequential(
+ (conv): ConvReLU2d(
+ (0): Conv2d(24, 72, kernel_size=(1, 1), stride=(1, 1))
+ (1): ReLU(inplace=True)
+ )
+ (bn): Identity()
+ (act): ReLU(
+ (act): Identity()
+ )
+ )
+ (2): _Reshape()
+ )
+ )
+ (conv_1): Conv3d3x3x3DwBnAct(
+ (kernel): Sequential(
+ (conv): _Conv3dTemporalKernel3Decomposed(
+ (_conv2d_3_3_0): Conv2d(72, 72, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=72, bias=False)
+ (_conv2d_3_3_2): Conv2d(72, 72, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=72, bias=False)
+ (_conv2d_3_3_1): Conv2d(72, 72, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=72)
+ (_add_funcs): ModuleList(
+ (0): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (1): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (2): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (3): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (4): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ (5): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ )
+ (_cat_func): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ )
+ (bn): Identity()
+ (act): Identity(
+ (act): Identity()
+ )
+ )
+ )
+ (se): SqueezeExcitation(
+ (se): _SkipConnectMul(
+ (layer): Sequential(
+ (0): AdaptiveAvgPool3d(output_size=1)
+ (1): _Reshape()
+ (2): Linear(in_features=72, out_features=8, bias=True)
+ (3): ReLU()
+ (4): Linear(in_features=8, out_features=72, bias=True)
+ (5): Sigmoid()
+ (6): _Reshape()
+ )
+ (mul_func): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ )
+ )
+ (act_func_1): Swish(
+ (act): _NaiveSwish(
+ (mul_func): FloatFunctional(
+ (activation_post_process): Identity()
+ )
+ )
+ )
+ (conv_2): Conv3dPwBnAct(
+ (kernel): Sequential(
+ (0): _Reshape()
+ (1): Sequential(
+ (conv): Conv2d(72, 24, kernel_size=(1, 1), stride=(1, 1))
+ (bn): Identity()
+ (act): Identity(
+ (act): Identity()
+ )
+ )
+ (2): _Reshape()
+ )
+ )
+ )
+ )
+ (avg_pool): AdaptiveAvgPool3dOutSize1(
+ (pool): AvgPool3d(kernel_size=(4, 6, 6), stride=(4, 6, 6), padding=0)
+ )
+ (projection): FullyConnected(
+ (model): Linear(in_features=24, out_features=4, bias=True)
+ )
+ (act): ReLU(
+ (act): ReLU(inplace=True)
+ )
+ )
+
+
+Let's check whether the network after conversion is arithmetically equivalent. We expect the outputs to be very close before and after conversion, with some small difference due to numeric noise from floating point operations.
+
+
+```python
+net_inst.eval()
+out_ref = net_inst(input_tensor)
+out = net_inst_deploy(input_tensor)
+
+max_err = float(torch.max(torch.abs(out_ref - out)))
+print(f"max error is {max_err}")
+```
+
+ max error is 2.9802322387695312e-08
+
+
+Next we have two options: either deploy the floating point model, or quantize the model to int8 and then deploy.
+
+Let's first assume we want to deploy the floating point model. In this case, all we need to do is export a JIT trace and then apply `optimize_for_mobile` for final optimization.
+
+
+```python
+from torch.utils.mobile_optimizer import (
+ optimize_for_mobile,
+)
+traced_model = torch.jit.trace(net_inst_deploy, input_tensor, strict=False)
+traced_model_opt = optimize_for_mobile(traced_model)
+# Here we can save the traced_model_opt to JIT file using traced_model_opt.save()
+```
+
+Alternatively, we may also want to deploy a quantized model. Efficient blocks are quantization-friendly by design - just wrap the model in deploy form with `QuantStub/DeQuantStub` and it is ready for PyTorch eager mode quantization.
+
+
+```python
+# Wrapper class for adding QuantStub/DeQuantStub.
+class quant_stub_wrapper(nn.Module):
+ def __init__(self, module_in):
+ super().__init__()
+ self.quant = torch.quantization.QuantStub()
+ self.model = module_in
+ self.dequant = torch.quantization.DeQuantStub()
+ def forward(self, x):
+ x = self.quant(x)
+ x = self.model(x)
+ x = self.dequant(x)
+ return x
+```
+
+
+```python
+net_inst_quant_stub_wrapper = quant_stub_wrapper(net_inst_deploy)
+```
+
+Preparation step of quantization: fusion has already been done for efficient blocks automatically during `convert_to_deployable_form`, so we can proceed directly to `torch.quantization.prepare`.
+
+
+```python
+net_inst_quant_stub_wrapper.qconfig = torch.quantization.default_qconfig
+net_inst_quant_stub_wrapper_prepared = torch.quantization.prepare(net_inst_quant_stub_wrapper)
+```
+
+Calibration and quantization. After preparation, we calibrate the prepared model by feeding it a calibration dataset (skipped here), and then convert it to a quantized model.
+
+
+```python
+# calibration is skipped here.
+net_inst_quant_stub_wrapper_quantized = torch.quantization.convert(net_inst_quant_stub_wrapper_prepared)
+```
+
+
+Then we can export a trace of the int8 model and deploy it on mobile devices.
+
+
+```python
+traced_model_int8 = torch.jit.trace(net_inst_quant_stub_wrapper_quantized, input_tensor, strict=False)
+traced_model_int8_opt = optimize_for_mobile(traced_model_int8)
+# Here we can save traced_model_int8_opt to a JIT file using traced_model_int8_opt.save()
+```
+
diff --git a/website/docs/tutorial_accelerator_use_accelerator_model_zoo.md b/website/docs/tutorial_accelerator_use_accelerator_model_zoo.md
new file mode 100644
index 00000000..9df08ca8
--- /dev/null
+++ b/website/docs/tutorial_accelerator_use_accelerator_model_zoo.md
@@ -0,0 +1,118 @@
+---
+id: tutorial_accelerator_use_accelerator_model_zoo
+title: Use PytorchVideo/Accelerator Model Zoo
+---
+
+
+## Introduction
+This tutorial goes through how to use the model zoo provided by PytorchVideo/Accelerator. To use the model zoo, we generally follow several steps:
+- Use the model builder to build the selected model;
+- Load the pretrained checkpoint;
+- (Optional) Finetune;
+- Deploy.
+
+## Use model builder to build selected model
+We use the model builder in the PytorchVideo/Accelerator model zoo to build a pre-defined efficient model. Here we use EfficientX3D-XS (for mobile_cpu) as an example. For more available models and details, please refer to [this page].
+
+EfficientX3D-XS is an implementation of the X3D-XS network described in the [X3D paper](https://arxiv.org/abs/2004.04730) using efficient blocks. It is arithmetically equivalent to X3D-XS, but our benchmark on a mobile phone shows a 4.6X latency reduction compared with the vanilla implementation.
+
+In order to build EfficientX3D-XS, we simply do the following:
+
+
+```python
+from pytorchvideo.models.accelerator.mobile_cpu.efficient_x3d import EfficientX3d
+model_efficient_x3d_xs = EfficientX3d(expansion='XS', head_act='identity')
+```
+
+Note that now the efficient blocks in the model are in original form, so the model is good for further training.
+
+## Load pretrained checkpoint and (optionally) finetune
+For each model in the model zoo, we provide a pretrained checkpoint state_dict for the model in original form. See [this page] for details about checkpoints and where to download them.
+
+
+```python
+from torch.hub import load_state_dict_from_url
+checkpoint_path = 'https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/efficient_x3d_xs_original_form.pyth'
+checkpoint = load_state_dict_from_url(checkpoint_path)
+
+model_efficient_x3d_xs.load_state_dict(checkpoint)
+```
+
+Now the model is ready for fine-tuning.
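+
+As an illustration only, here is a minimal sketch of a single fine-tuning step. The random tensors, the Adam optimizer and the flatten to (N, num_classes) are assumptions standing in for a real Kinetics dataloader and training recipe.
+
+
+```python
+import torch
+import torch.nn as nn
+
+# Hypothetical fine-tuning step with dummy data (replace with your own dataloader).
+optimizer = torch.optim.Adam(model_efficient_x3d_xs.parameters(), lr=1e-4)
+criterion = nn.CrossEntropyLoss()
+
+model_efficient_x3d_xs.train()
+clips = torch.randn(2, 3, 4, 160, 160)   # (N, C, T, H, W) dummy clips
+labels = torch.randint(0, 400, (2,))     # dummy Kinetics-400 labels
+
+optimizer.zero_grad()
+logits = model_efficient_x3d_xs(clips).reshape(clips.shape[0], -1)  # (N, 400)
+loss = criterion(logits, labels)
+loss.backward()
+optimizer.step()
+```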
+
+## Deploy
+Now the model is ready to deploy. First of all, let's convert the model into deploy form. In order to do that, we need to use `convert_to_deployable_form` utility and provide an example input tensor to the model. Note that once the model is converted into deploy form, the input size should be the same as the example input tensor size during conversion.
+
+
+```python
+import torch
+from pytorchvideo.accelerator.deployment.mobile_cpu.utils.model_conversion import (
+ convert_to_deployable_form,
+)
+input_blob_size = (1, 3, 4, 160, 160)
+input_tensor = torch.randn(input_blob_size)
+model_efficient_x3d_xs_deploy = convert_to_deployable_form(model_efficient_x3d_xs, input_tensor)
+```
+
+Next we have two options: either deploy the floating point model, or quantize the model to int8 and then deploy.
+
+Let's first assume we want to deploy the floating point model. In this case, all we need to do is export a JIT trace and then apply `optimize_for_mobile` for final optimization.
+
+
+```python
+from torch.utils.mobile_optimizer import (
+ optimize_for_mobile,
+)
+traced_model = torch.jit.trace(model_efficient_x3d_xs_deploy, input_tensor, strict=False)
+traced_model_opt = optimize_for_mobile(traced_model)
+# Here we can save the traced_model_opt to JIT file using traced_model_opt.save()
+```
+
+Alternatively, we may also want to deploy a quantized model. Efficient blocks are quantization-friendly by design - just wrap the model in deploy form with `QuantStub/DeQuantStub` and it is ready for PyTorch eager mode quantization.
+
+
+```python
+# Wrapper class for adding QuantStub/DeQuantStub.
+class quant_stub_wrapper(nn.Module):
+ def __init__(self, module_in):
+ super().__init__()
+ self.quant = torch.quantization.QuantStub()
+ self.model = module_in
+ self.dequant = torch.quantization.DeQuantStub()
+ def forward(self, x):
+ x = self.quant(x)
+ x = self.model(x)
+ x = self.dequant(x)
+ return x
+```
+
+
+```python
+model_efficient_x3d_xs_deploy_quant_stub_wrapper = quant_stub_wrapper(model_efficient_x3d_xs_deploy)
+```
+
+Preparation step of quantization: fusion has already been done for efficient blocks automatically during `convert_to_deployable_form`, so we can proceed directly to `torch.quantization.prepare`.
+
+
+```python
+model_efficient_x3d_xs_deploy_quant_stub_wrapper.qconfig = torch.quantization.default_qconfig
+model_efficient_x3d_xs_deploy_quant_stub_wrapper_prepared = torch.quantization.prepare(model_efficient_x3d_xs_deploy_quant_stub_wrapper)
+```
+
+Calibration and quantization. After preparation, we calibrate the prepared model by feeding it a calibration dataset (skipped here), and then convert it to a quantized model.
+
+
+```python
+# calibration is skipped here.
+model_efficient_x3d_xs_deploy_quant_stub_wrapper_quantized = torch.quantization.convert(model_efficient_x3d_xs_deploy_quant_stub_wrapper_prepared)
+```
+
+Then we can export a trace of the int8 model and deploy it on mobile devices.
+
+
+```python
+traced_model_int8 = torch.jit.trace(model_efficient_x3d_xs_deploy_quant_stub_wrapper_quantized, input_tensor, strict=False)
+traced_model_int8_opt = optimize_for_mobile(traced_model_int8)
+# Here we can save traced_model_int8_opt to a JIT file using traced_model_int8_opt.save()
+```
+
diff --git a/website/docs/tutorial_accelerator_use_model_transmuter.md b/website/docs/tutorial_accelerator_use_model_transmuter.md
new file mode 100644
index 00000000..45e74d12
--- /dev/null
+++ b/website/docs/tutorial_accelerator_use_model_transmuter.md
@@ -0,0 +1,98 @@
+---
+id: tutorial_accelerator_use_model_transmuter
+title: Accelerate your model with model transmuter in PytorchVideo/Accelerator
+---
+
+
+## Introduction
+Got your own model, but still want to fully leverage efficient blocks in PytorchVideo/Accelerator? No problem, model transmuter can help you.
+Model transmuter is a utility in PytorchVideo/Accelerator that takes a user-defined model and replaces modules in the model with equivalent efficient blocks when possible.
+In this tutorial, we will go through typical steps of using model transmuter, including:
+- Use model transmuter to replace modules in user model with efficient blocks
+- Convert model into deploy form and deploy
+
+## Use model transmuter to replace modules in user model with efficient blocks
+First, let's assume user has following model to be transmuted:
+
+
+```python
+import torch
+import torch.nn as nn
+
+class user_model_residual_block(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.stem0 = nn.Conv3d(3, 3, kernel_size=(3, 1, 1), padding=(1, 0, 0))
+ self.stem1 = nn.Conv3d(3, 3, kernel_size=(5, 1, 1), padding=(2, 0, 0))
+ self.pw = nn.Conv3d(3, 6, kernel_size=1)
+ self.relu = nn.ReLU()
+ self.dw = nn.Conv3d(6, 6, kernel_size=3, padding=1, groups=6)
+ self.relu1 = nn.ReLU()
+ self.pwl = nn.Conv3d(6, 3, kernel_size=1)
+ self.relu2 = nn.ReLU()
+
+ def forward(self, x):
+ out = self.stem0(x)
+ out = self.stem1(out)
+ out = self.pw(out)
+ out = self.relu(out)
+ out = self.dw(out)
+ out = self.relu1(out)
+ out = self.pwl(out)
+ return self.relu2(out + x)
+```
+
+Then, let's use the model transmuter by importing the transmuter for the target device. In this tutorial, we use mobile CPU as an example, so we import (1) the model transmuter for mobile CPU and (2) the top-level wrapper of the model transmuter.
+
+
+```python
+import pytorchvideo.accelerator.deployment.mobile_cpu.transmuter # mobile cpu model transmuter
+from pytorchvideo.accelerator.deployment.common.model_transmuter import transmute_model # top-level wrapper of model transmuter
+```
+
+We instantiate one user_model_residual_block, and transmute it by calling `transmute_model` with the argument `target_device="mobile_cpu"`.
+
+
+```python
+model_transmute = user_model_residual_block()
+transmute_model(
+ model_transmute,
+ target_device="mobile_cpu",
+)
+```
+
+If we print the model, as in the snippet below, we will find that some of the modules in the model have been replaced. In general, the model transmuter replaces a submodule if an equivalent efficient block is found; otherwise the submodule is kept intact.
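+
+For example, a quick inspection of the transmuted model:
+
+
+```python
+# Print the transmuted model to see which submodules were replaced by efficient blocks.
+print(model_transmute)
+```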
+
+
+## Convert model into deploy form and deploy
+Now the model is ready to deploy. First of all, let's convert the model into deploy form. In order to do that, we need to use the `convert_to_deployable_form` utility and provide an example input tensor to the model. `convert_to_deployable_form` will convert any instance of `EfficientBlockBase` (base class for efficient blocks in PytorchVideo/Accelerator) into deploy form, while leaving other modules unchanged.
+Note that once the model is converted into deploy form, the input size should be the same as the example input tensor size during conversion.
+
+
+```python
+# Define example input tensor
+input_blob_size = (1, 3, 4, 6, 6)
+input_tensor = torch.randn(input_blob_size)
+```
+
+
+```python
+from pytorchvideo.accelerator.deployment.mobile_cpu.utils.model_conversion import (
+ convert_to_deployable_form,
+)
+model_transmute_deploy = convert_to_deployable_form(
+ model_transmute, input_tensor
+)
+```
+
+Currently the model transmuter only supports fp32 operation; int8 support will come with the upcoming torch.fx quantization mode. In this tutorial, we assume the transmuted model is deployed without quantization. In this case, all we need to do is export a JIT trace and then apply `optimize_for_mobile` for final optimization.
+
+
+```python
+from torch.utils.mobile_optimizer import (
+ optimize_for_mobile,
+)
+traced_model = torch.jit.trace(model_transmute_deploy, input_tensor, strict=False)
+traced_model_opt = optimize_for_mobile(traced_model)
+# Here we can save the traced_model_opt to JIT file using traced_model_opt.save()
+```
diff --git a/website/docs/tutorial_classification.md b/website/docs/tutorial_classification.md
new file mode 100644
index 00000000..07a5a34e
--- /dev/null
+++ b/website/docs/tutorial_classification.md
@@ -0,0 +1,221 @@
+---
+id: tutorial_classification
+title: Training a PyTorchVideo classification model
+---
+
+# Introduction
+
+In this tutorial we will show how to build a simple video classification training pipeline using PyTorchVideo models, datasets and transforms. We'll be using a 3D ResNet [1] for the model, Kinetics [2] for the dataset and a standard video transform augmentation recipe. As PyTorchVideo doesn't contain training code, we'll use [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning) - a lightweight PyTorch training framework - to help out. Don't worry if you don't have Lightning experience, we'll explain what's needed as we go along.
+
+[1] He, Kaiming, et al. Deep Residual Learning for Image Recognition. ArXiv:1512.03385, 2015.
+
+[2] W. Kay, et al. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950, 2017.
+
+# Dataset
+
+To start off with, let's set up the PyTorchVideo Kinetics data loader using a [pytorch_lightning.LightningDataModule](https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.core.datamodule.html#pytorch_lightning.core.datamodule.LightningDataModule). A LightningDataModule is a wrapper that defines the train, val and test data partitions; we'll use it to wrap the PyTorchVideo Kinetics dataset below.
+
+The PyTorchVideo Kinetics dataset is just an alias for the general [pytorchvideo.data.EncodedVideoDataset](http://pytorchvideo.org/api/data/encoded_video.html#pytorchvideo.data.encoded_video_dataset.EncodedVideoDataset) class. If you look at its constructor, you'll notice that most args are what you'd expect (e.g. path to data). However, there are a few args that are more specific to PyTorchVideo datasets:
+- video_sampler - defining the order in which videos are sampled at each iteration. The default is "random".
+- clip_sampler - defining how to sample a clip from the chosen video at each iteration. For a train partition it is typical to use a "random" clip sampler (i.e. take a random clip of the specified duration from the video). For testing, typically you'll use "uniform" (i.e. uniformly sample all clips of the specified duration from the video) to ensure the entire video is sampled in each epoch.
+- transform - this provides a way to apply user defined data preprocessing or augmentation before batch collating by the PyTorch data loader. We'll show an example using this later.
+
+
+```python
+import os
+import pytorch_lightning
+import pytorchvideo.data
+import torch.utils.data
+
+class KineticsDataModule(pytorch_lightning.LightningDataModule):
+
+ # Dataset configuration
+ _DATA_PATH =
+ _CLIP_DURATION = 2 # Duration of sampled clip for each video
+ _BATCH_SIZE = 8
+ _NUM_WORKERS = 8 # Number of parallel processes fetching data
+
+ def train_dataloader(self):
+ """
+ Create the Kinetics train partition from the list of video labels
+ in {self._DATA_PATH}/train.csv
+ """
+ train_dataset = pytorchvideo.data.Kinetics(
+ data_path=os.path.join(self._DATA_PATH, "train.csv"),
+ clip_sampler=pytorchvideo.data.make_clip_sampler("random", self._CLIP_DURATION),
+ )
+ return torch.utils.data.DataLoader(
+ train_dataset,
+ batch_size=self._BATCH_SIZE,
+ num_workers=self._NUM_WORKERS,
+ )
+
+ def val_dataloader(self):
+ """
+ Create the Kinetics validation partition from the list of video labels
+        in {self._DATA_PATH}/val.csv
+ """
+ val_dataset = pytorchvideo.data.Kinetics(
+ data_path=os.path.join(self._DATA_PATH, "val.csv"),
+ clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", self._CLIP_DURATION),
+ )
+ return torch.utils.data.DataLoader(
+ val_dataset,
+ batch_size=self._BATCH_SIZE,
+ num_workers=self._NUM_WORKERS,
+ )
+```
+
+# Transforms
+
+As mentioned above, PyTorchVideo datasets take a "transform" callable arg that defines custom processing (e.g. augmentations, normalization) that's applied to each clip. The callable arg takes a clip dictionary defining the different modalities and metadata. pytorchvideo.data.Kinetics clips have the following dictionary format:
+
+```python
+ {
+       'video': <video_tensor>, # Shape: (C, T, H, W)
+       'audio': <audio_tensor>, # Shape: (S)
+       'label': <action_label>, # Integer defining class annotation
+       'video_name': <video_path>, # Video file path stem
+       'video_index': <video_id>, # Index of video used by sampler
+       'clip_index': <clip_id> # Index of clip sampled within video
+ }
+```
+
+PyTorchVideo provides several transforms which you can see in the [docs](http://pytorchvideo.org/api/transforms/transforms.html). Notably, PyTorchVideo provides dictionary transforms that can be used to easily interoperate with other domain specific libraries. For example, [pytorchvideo.transforms.ApplyTransformToKey(key, transform)](http://pytorchvideo.org/api/transforms/transforms.html#pytorchvideo.transforms.transforms.ApplyTransformToKey) can be used to apply domain specific transforms to a specific dictionary key. For video tensors we use the same tensor shape as TorchVision and for audio we use TorchAudio tensor shapes, making it easy to apply their transforms alongside PyTorchVideo ones.
+
+Below we revise the LightningDataModule from the last section to include transforms coming from both TorchVision and PyTorchVideo. For brevity we'll just show the KineticsDataModule.train_dataloader method. The validation dataset transforms would be the same, just without the augmentations (RandomShortSideScale, RandomCrop, RandomHorizontalFlip).
+
+```python
+from pytorchvideo.transforms import (
+ ApplyTransformToKey,
+ RandomShortSideScale,
+ RemoveKey,
+ ShortSideScale,
+ UniformTemporalSubsample
+)
+
+from torchvision.transforms import (
+ Compose,
+ Normalize,
+ RandomCrop,
+ RandomHorizontalFlip
+)
+
+class KineticsDataModule(pytorch_lightning.LightningDataModule):
+
+    # ...
+
+    def train_dataloader(self):
+        """
+        Create the Kinetics train partition from the list of video labels
+        in {self._DATA_PATH}/train.csv. Add transform that subsamples and
+        normalizes the video before applying the scale, crop and flip augmentations.
+        """
+        train_transform = Compose(
+            [
+                ApplyTransformToKey(
+                    key="video",
+                    transform=Compose(
+                        [
+                            UniformTemporalSubsample(8),
+                            Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
+                            RandomShortSideScale(min_size=256, max_size=320),
+                            RandomCrop(244),
+                            RandomHorizontalFlip(p=0.5),
+                        ]
+                    ),
+                ),
+            ]
+        )
+        train_dataset = pytorchvideo.data.Kinetics(
+            data_path=os.path.join(self._DATA_PATH, "train.csv"),
+            clip_sampler=pytorchvideo.data.make_clip_sampler("random", self._CLIP_DURATION),
+            transform=train_transform,
+        )
+        return torch.utils.data.DataLoader(
+            train_dataset,
+            batch_size=self._BATCH_SIZE,
+            num_workers=self._NUM_WORKERS,
+        )
+
+    # ...
+
+```
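+
+The corresponding validation loader can reuse the same structure with a deterministic transform. This is a minimal sketch, not shown in the original text: it assumes ShortSideScale(256) as the eval-time resize and simply omits the random augmentations.
+
+```python
+class KineticsDataModule(pytorch_lightning.LightningDataModule):
+
+    # ...
+
+    def val_dataloader(self):
+        """
+        Create the Kinetics validation partition with a deterministic transform:
+        subsample and normalize the video, then scale the short side, with no
+        random augmentations.
+        """
+        val_transform = Compose(
+            [
+                ApplyTransformToKey(
+                    key="video",
+                    transform=Compose(
+                        [
+                            UniformTemporalSubsample(8),
+                            Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
+                            ShortSideScale(256),
+                        ]
+                    ),
+                ),
+            ]
+        )
+        val_dataset = pytorchvideo.data.Kinetics(
+            data_path=os.path.join(self._DATA_PATH, "val.csv"),
+            clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", self._CLIP_DURATION),
+            transform=val_transform,
+        )
+        return torch.utils.data.DataLoader(
+            val_dataset,
+            batch_size=self._BATCH_SIZE,
+            num_workers=self._NUM_WORKERS,
+        )
+```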
+
+# Model
+
+All PyTorchVideo models and layers can be built with simple, reproducible factory functions. We call this the "flat" model interface because the args don't require hierarchies of configs to be used. An example building a default ResNet can be found below. See the [docs](http://pytorchvideo.org/api/models/resnet.html#pytorchvideo.models.resnet.create_bottleneck_block) for more configuration options.
+
+```python
+import pytorchvideo.models.resnet
+import torch.nn as nn
+
+def make_kinetics_resnet():
+    return pytorchvideo.models.resnet.create_resnet(
+        input_channel=3,      # RGB input from Kinetics
+        model_depth=50,       # For the tutorial let's just use a 50-layer network
+        model_num_class=400,  # Kinetics has 400 classes, so our final head must match
+        norm=nn.BatchNorm3d,
+        activation=nn.ReLU,
+    )
+```
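+
+As a quick sanity check (not part of the original tutorial), the factory output can be probed with a dummy clip. The 8-frame, 224x224 input shape used here is an illustrative assumption:
+
+```python
+import torch
+
+model = make_kinetics_resnet()
+dummy_clip = torch.randn(1, 3, 8, 224, 224)  # (B, C, T, H, W)
+with torch.no_grad():
+    logits = model(dummy_clip)
+print(logits.shape)  # Expected to be torch.Size([1, 400]): one logit per Kinetics class
+```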
+
+# Putting it all together
+
+To put everything together, let's create a [pytorch_lightning.LightningModule](https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html). This defines the train and validation step code (i.e. the code inside the training and evaluation loops), and the optimizer.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class VideoClassificationLightningModule(pytorch_lightning.LightningModule):
+    def __init__(self):
+        super().__init__()
+        self.model = make_kinetics_resnet()
+
+    def forward(self, x):
+        return self.model(x)
+
+    def training_step(self, batch, batch_idx):
+        # The model expects a video tensor of shape (B, C, T, H, W), which is the
+        # format provided by the dataset.
+        y_hat = self.model(batch["video"])
+
+        # Compute cross entropy loss; loss.backward() will be called behind the scenes
+        # by PyTorch Lightning after this method returns.
+        loss = F.cross_entropy(y_hat, batch["label"])
+
+        # Log the train loss to TensorBoard.
+        self.log("train_loss", loss.item())
+
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        y_hat = self.model(batch["video"])
+        loss = F.cross_entropy(y_hat, batch["label"])
+        self.log("val_loss", loss)
+        return loss
+
+    def configure_optimizers(self):
+        """
+        Set up the Adam optimizer. Note that this function can also return a
+        learning-rate scheduler, which is often useful for training video models.
+        """
+        return torch.optim.Adam(self.parameters(), lr=1e-1)
+```
+
+Our VideoClassificationLightningModule and KineticsDataModule are ready to be trained together using the [pytorch_lightning.Trainer](https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html)! The Trainer class has many arguments that define the training environment (e.g. number of GPUs, distributed backend). To keep things simple we'll just use the default local CPU training, but note that this would likely take weeks to train, so you might want to use more performant settings based on your environment (a GPU-based sketch follows the snippet below).
+
+```python
+def train():
+    classification_module = VideoClassificationLightningModule()
+    data_module = KineticsDataModule()
+    trainer = pytorch_lightning.Trainer()
+    trainer.fit(classification_module, data_module)
+```
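+
+For faster training, a multi-GPU configuration might look like the following sketch. The argument names (gpus, accelerator, max_epochs) assume a PyTorch Lightning 1.x release; newer versions spell some of these options differently (e.g. accelerator="gpu", strategy="ddp"):
+
+```python
+import pytorch_lightning
+
+def train_on_gpus():
+    classification_module = VideoClassificationLightningModule()
+    data_module = KineticsDataModule()
+    trainer = pytorch_lightning.Trainer(
+        gpus=8,             # Number of GPUs on this machine (assumption)
+        accelerator="ddp",  # Distributed data-parallel training
+        max_epochs=200,     # Train for a fixed number of epochs
+    )
+    trainer.fit(classification_module, data_module)
+```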
+
+# Conclusion
+
+In this tutorial we showed how to train a 3D ResNet on Kinetics using PyTorch Lightning. You can see the final code from the tutorial (including a few extra bells and whistles) in the PyTorchVideo projects directory.
+
+To learn more about PyTorchVideo, check out the rest of the [documentation](http://pytorchvideo.org/docs/api/index.html) and [tutorials](http://pytorchvideo.org/docs/tutorial_overview).
diff --git a/website/docs/tutorial_overview.md b/website/docs/tutorial_overview.md
new file mode 100644
index 00000000..eab4c90e
--- /dev/null
+++ b/website/docs/tutorial_overview.md
@@ -0,0 +1,10 @@
+---
+id: tutorial_overview
+title: Tutorials
+sidebar_label: Overview
+---
+
+PyTorchVideo tutorials are designed to help you get acquainted with the library and also give you an idea of how to incorporate different PyTorchVideo components into your own video-research workflow. In the tutorials, through examples, we also show how PyTorchVideo makes it easy to address some of the common deep-learning video use cases.
+
+PyTorchVideo is built on PyTorch. If you are new to PyTorch, the easiest way to get started is with the [PyTorch: A 60 Minute Blitz](https://pytorch.org/tutorials/beginner/blitz/tensor_tutorial.html#sphx-glr-beginner-blitz-tensor-tutorial-py) tutorial.
+
diff --git a/website/docs/tutorial_torchhub_inference.md b/website/docs/tutorial_torchhub_inference.md
new file mode 100644
index 00000000..9a39460e
--- /dev/null
+++ b/website/docs/tutorial_torchhub_inference.md
@@ -0,0 +1,164 @@
+---
+id: tutorial_torchhub_inference
+title: Running a pre-trained PyTorchVideo classification model using Torch Hub
+---
+
+# Introduction
+
+PyTorchVideo provides several pretrained models through [Torch Hub](https://pytorch.org/hub/). In this tutorial we will show how to load a pre-trained video classification model in PyTorchVideo and run it on a test video. The PyTorchVideo Torch Hub models were trained on the Kinetics 400 [1] dataset. Available models are described in the [model zoo documentation](https://github.com/facebookresearch/pytorchvideo/blob/master/docs/source/model_zoo.md#kinetics-400).
+
+[1] W. Kay, et al. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950, 2017.
+
+NOTE: Currently, this tutorial will only work with a local clone of the PyTorchVideo GitHub repo.
+
+# Imports
+
+```python
+import json
+import torch
+from torchvision.transforms import Compose, Lambda
+from torchvision.transforms._transforms_video import (
+ CenterCropVideo,
+ NormalizeVideo,
+)
+from pytorchvideo.data.encoded_video import EncodedVideo
+from pytorchvideo.transforms import (
+ ApplyTransformToKey,
+ ShortSideScale,
+ UniformTemporalSubsample
+)
+```
+
+# Load Model
+
+Let's select the `slow_r50` model, which was trained using an 8x8 setting on the Kinetics 400 dataset.
+
+```python
+# Device on which to run the model
+device = "cuda:0"
+
+# Pick a pretrained model
+model_name = "slow_r50"
+
+# Local path to the parent folder of hubconf.py in the pytorchvideo codebase
+path = '../'
+model = torch.hub.load(path, source="local", model=model_name, pretrained=True)
+
+# Set to eval mode and move to desired device
+model = model.eval()
+model = model.to(device)
+```
+
+# Setup Labels
+
+Next, let's download the id-to-label mapping for the Kinetics 400 dataset on which the Torch Hub models were trained. This will be used to get the category label names from the predicted class ids.
+
+```python
+!wget https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json
+```
+
+```python
+with open("kinetics_classnames.json", "r") as f:
+    kinetics_classnames = json.load(f)
+
+# Create an id to label name mapping
+kinetics_id_to_classname = {}
+for k, v in kinetics_classnames.items():
+    kinetics_id_to_classname[v] = str(k).replace('"', "")
+```
+
+# Input Transform
+
+Before passing the video into the model we need to apply some input transforms and sample a clip of the correct duration. We will define them below.
+
+```python
+side_size = 256
+mean = [0.45, 0.45, 0.45]
+std = [0.225, 0.225, 0.225]
+crop_size = 256
+num_frames = 8
+sampling_rate = 8
+frames_per_second = 30
+
+# Note that this transform is specific to the slow_r50 model.
+# If you want to try another of the Torch Hub models you will need to modify this transform.
+transform = ApplyTransformToKey(
+    key="video",
+    transform=Compose(
+        [
+            UniformTemporalSubsample(num_frames),
+            Lambda(lambda x: x / 255.0),
+            NormalizeVideo(mean, std),
+            ShortSideScale(size=side_size),
+            CenterCropVideo(crop_size=(crop_size, crop_size)),
+        ]
+    ),
+)
+
+# The duration of the input clip is also specific to the model.
+clip_duration = (num_frames * sampling_rate)/frames_per_second
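+# For slow_r50 this works out to (8 * 8) / 30 ~= 2.13 seconds of video per clip.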
+```
+
+# Load an example video
+We can now test the model with an example video from the Kinetics validation set such as this [archery video](https://www.youtube.com/watch?v=3and4vWkW4s).
+
+We will load the video and apply the input transform.
+
+
+```python
+# Download the example video file
+!wget https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4
+```
+
+```python
+# Load the example video
+video_path = "archery.mp4"
+
+# Select the clip to load by specifying its start and end times (in seconds).
+# The start_sec should correspond to where the action occurs in the video.
+start_sec = 0
+end_sec = start_sec + clip_duration
+
+# Initialize an EncodedVideo helper class
+video = EncodedVideo.from_path(video_path)
+
+# Load the desired clip
+video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
+
+# Apply a transform to normalize the video input
+video_data = transform(video_data)
+
+# Move the inputs to the desired device
+inputs = video_data["video"]
+inputs = inputs.to(device)
+```
+
+# Get model predictions
+
+Now we are ready to pass the input into the model and classify the action.
+
+```python
+# Pass the input clip through the model
+preds = model(inputs[None, ...])
+```
+
+Let's look at the top 5 predictions:
+
+```python
+# Get the predicted classes
+post_act = torch.nn.Softmax(dim=1)
+preds = post_act(preds)
+pred_classes = preds.topk(k=5).indices
+
+# Map the predicted classes to the label names
+pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes[0]]
+print("Predicted labels: %s" % ", ".join(pred_class_names))
+```
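+
+Since the softmax has already been applied, the corresponding confidence scores can be read off the same topk call. This small addition is not in the original tutorial:
+
+```python
+# Pair each predicted label with its softmax score
+top5 = preds.topk(k=5)
+pred_scores = top5.values[0].tolist()
+for name, score in zip(pred_class_names, pred_scores):
+    print(f"{name}: {score:.3f}")
+```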
+
+# Conclusion
+
+In this tutorial we showed how to load and run a pretrained PyTorchVideo model on a test video. You can run this tutorial as a notebook in the PyTorchVideo tutorials directory.
+
+To learn more about PyTorchVideo, check out the rest of the [documentation](http://pytorchvideo.org/docs/api/index.html) and [tutorials](http://pytorchvideo.org/docs/tutorial_overview).
diff --git a/website/website/README.md b/website/website/README.md
new file mode 100644
index 00000000..2e7802e3
--- /dev/null
+++ b/website/website/README.md
@@ -0,0 +1,216 @@
+This website was created with [Docusaurus](https://docusaurus.io/).
+
+# Building the PyTorchVideo website
+
+## Install
+
+1. Make sure all the dependencies for the website are installed:
+
+```sh
+# Install dependencies
+$ yarn
+
+# or, using npm:
+
+$ npm install docusaurus-init
+```
+
+2. Run your dev server:
+
+```sh
+# Start the site
+$ yarn start
+
+# or, run the dev server directly:
+$ ./node_modules/docusaurus/lib/start-server.js
+```
+
+
+## Edit the landing page
+
+To change the content of the landing page modify: `website/pages/en/index.js`.
+
+
+---------------------------------------------------------
+
+## Docusaurus docs
+
+- [Directory Structure](#directory-structure)
+- [Editing Content](#editing-content)
+- [Adding Content](#adding-content)
+- [Full Documentation](#full-documentation)
+
+
+## Directory Structure
+
+Your project file structure should look something like this:
+
+```
+my-docusaurus/
+ docs/
+ doc-1.md
+ doc-2.md
+ doc-3.md
+ website/
+ blog/
+ 2016-3-11-oldest-post.md
+ 2017-10-24-newest-post.md
+ core/
+ node_modules/
+ pages/
+ static/
+ css/
+ img/
+ package.json
+ sidebars.json
+ siteConfig.js
+```
+
+# Editing Content
+
+## Editing an existing docs page
+
+Edit docs by navigating to `docs/` and editing the corresponding document:
+
+`docs/doc-to-be-edited.md`
+
+```markdown
+---
+id: page-needs-edit
+title: This Doc Needs To Be Edited
+---
+
+Edit me...
+```
+
+For more information about docs, click [here](https://docusaurus.io/docs/en/navigation)
+
+## Editing an existing blog post
+
+Edit blog posts by navigating to `website/blog` and editing the corresponding post:
+
+`website/blog/post-to-be-edited.md`
+
+```markdown
+---
+id: post-needs-edit
+title: This Blog Post Needs To Be Edited
+---
+
+Edit me...
+```
+
+For more information about blog posts, click [here](https://docusaurus.io/docs/en/adding-blog)
+
+# Adding Content
+
+## Adding a new docs page to an existing sidebar
+
+1. Create the doc as a new markdown file in `/docs`, example `docs/newly-created-doc.md`:
+
+```md
+---
+id: newly-created-doc
+title: This Doc Needs To Be Edited
+---
+
+My new content here..
+```
+
+2. Refer to that doc's ID in an existing sidebar in `website/sidebars.json`:
+
+```javascript
+// Add newly-created-doc to the Getting Started category of docs
+{
+ "docs": {
+ "Getting Started": [
+ "quick-start",
+ "newly-created-doc" // new doc here
+ ],
+ ...
+ },
+ ...
+}
+```
+
+For more information about adding new docs, click [here](https://docusaurus.io/docs/en/navigation)
+
+## Adding a new blog post
+
+1. Make sure there is a header link to your blog in `website/siteConfig.js`:
+
+`website/siteConfig.js`
+
+```javascript
+headerLinks: [
+ ...
+ { blog: true, label: 'Blog' },
+ ...
+]
+```
+
+2. Create the blog post with the format `YYYY-MM-DD-My-Blog-Post-Title.md` in `website/blog`:
+
+`website/blog/2018-05-21-New-Blog-Post.md`
+
+```markdown
+---
+author: Frank Li
+authorURL: https://twitter.com/foobarbaz
+authorFBID: 503283835
+title: New Blog Post
+---
+
+Lorem Ipsum...
+```
+
+For more information about blog posts, click [here](https://docusaurus.io/docs/en/adding-blog)
+
+## Adding items to your site's top navigation bar
+
+1. Add links to docs, custom pages or external links by editing the headerLinks field of `website/siteConfig.js`:
+
+`website/siteConfig.js`
+
+```javascript
+{
+ headerLinks: [
+ ...
+ /* you can add docs */
+ { doc: 'my-examples', label: 'Examples' },
+ /* you can add custom pages */
+ { page: 'help', label: 'Help' },
+ /* you can add external links */
+ { href: 'https://github.com/facebook/docusaurus', label: 'GitHub' },
+ ...
+ ],
+ ...
+}
+```
+
+For more information about the navigation bar, click [here](https://docusaurus.io/docs/en/navigation)
+
+## Adding custom pages
+
+1. Docusaurus uses React components to build pages. The components are saved as .js files in `website/pages/en`.
+2. If you want your page to show up in your navigation header, you will need to update `website/siteConfig.js` to add to the `headerLinks` element:
+
+`website/siteConfig.js`
+
+```javascript
+{
+ headerLinks: [
+ ...
+ { page: 'my-new-custom-page', label: 'My New Custom Page' },
+ ...
+ ],
+ ...
+}
+```
+
+For more information about custom pages, click [here](https://docusaurus.io/docs/en/custom-pages).
+
+# Full Documentation
+
+Full documentation can be found on the [website](https://docusaurus.io/).
diff --git a/website/website/core/Footer.js b/website/website/core/Footer.js
new file mode 100644
index 00000000..ee5fc65e
--- /dev/null
+++ b/website/website/core/Footer.js
@@ -0,0 +1,91 @@
+/**
+ * Copyright (c) 2017-present, Facebook, Inc.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+
+const PropTypes = require("prop-types");
+const React = require('react');
+
+function SocialFooter(props) {
+ const repoUrl = `https://github.com/${props.config.organizationName}/${props.config.projectName}`;
+ return (
+
+ );
+}
+
+SocialFooter.propTypes = {
+ config: PropTypes.object
+};
+
+class Footer extends React.Component {
+ docUrl(doc, language) {
+ const baseUrl = this.props.config.baseUrl;
+ const docsUrl = this.props.config.docsUrl;
+ const docsPart = `${docsUrl ? `${docsUrl}/` : ''}`;
+ const langPart = `${language ? `${language}/` : ''}`;
+ return `${baseUrl}${docsPart}${langPart}${doc}`;
+ }
+
+ pageUrl(doc, language) {
+ const baseUrl = this.props.config.baseUrl;
+ return baseUrl + (language ? `${language}/` : '') + doc;
+ }
+
+ render() {
+ const repoUrl = `https://github.com/${this.props.config.organizationName}/${this.props.config.projectName}`;
+ return (
+
+ );
+ }
+}
+
+module.exports = Footer;
\ No newline at end of file
diff --git a/website/website/package.json b/website/website/package.json
new file mode 100644
index 00000000..a92c8b6e
--- /dev/null
+++ b/website/website/package.json
@@ -0,0 +1,14 @@
+{
+ "scripts": {
+ "examples": "docusaurus-examples",
+ "start": "docusaurus-start",
+ "build": "docusaurus-build",
+ "publish-gh-pages": "docusaurus-publish",
+ "write-translations": "docusaurus-write-translations",
+ "version": "docusaurus-version",
+ "rename-version": "docusaurus-rename-version"
+ },
+ "devDependencies": {
+ "docusaurus": "^1.14.6"
+ }
+}
diff --git a/website/website/pages/en/index.js b/website/website/pages/en/index.js
new file mode 100644
index 00000000..63d74628
--- /dev/null
+++ b/website/website/pages/en/index.js
@@ -0,0 +1,237 @@
+/**
+ * Copyright (c) 2021-present, Facebook, Inc.
+**/
+
+const React = require('react');
+
+const CompLibrary = require('../../core/CompLibrary.js');
+
+const MarkdownBlock = CompLibrary.MarkdownBlock; /* Used to read markdown */
+const Container = CompLibrary.Container;
+const GridBlock = CompLibrary.GridBlock;
+const bash = (...args) => `~~~bash\n${String.raw(...args)}\n~~~`;
+class HomeSplash extends React.Component {
+ render() {
+ const {siteConfig, language = ''} = this.props;
+ const {baseUrl, docsUrl} = siteConfig;
+ const docsPart = `${docsUrl ? `${docsUrl}/` : ''}`;
+ const langPart = `${language ? `${language}/` : ''}`;
+ const docUrl = doc => `${baseUrl}${docsPart}${langPart}${doc}`;
+
+ const SplashContainer = props => (
+
+ );
+
+ const Logo = props => (
+
+
+
+ );
+
+ const ProjectTitle = props => (
+
+ {props.tagline}
+
+ );
+
+ const PromoSection = props => (
+
+ );
+
+ const Button = props => (
+
+ );
+
+ return (
+
+
+
+
+
+ Get Started
+ Tutorials
+ GitHub
+
+
+
+ );
+ }
+}
+
+class Index extends React.Component {
+ render() {
+ const {config: siteConfig, language = ''} = this.props;
+ const {baseUrl} = siteConfig;
+
+ const Block = props => (
+
+
+
+ );
+
+ const Description = () => (
+
+ {[
+ {
+ content:
+ 'This is another description of how this project is useful',
+ image: `${baseUrl}img/placeholder.png`,
+ imageAlign: 'right',
+ title: 'Description',
+ },
+ ]}
+
+ );
+
+ const pre = '```';
+
+ const codeExample = `${pre}python
+from pytorchvideo import foo
+from pytorchvideo.models import bar
+ `;
+ const install = `${pre}bash
+pip install pytorchvideo
+ `;
+
+ const QuickStart = () => (
+
+
Get Started
+
+
+
+ Install pytorchvideo (Confirm requirements following the instructions here )
+ {install}
+
+
+ Try Video classification with Model Zoo
+ {codeExample}
+
+
+
+
+ );
+
+ const UseCases = () => (
+
+
Some use cases
+
+
+
+
+
Detection (Add GIF)
+
+
+
+
+
Tracking (Add GIF)
+
+
+
+
+
Classification (Add GIF)
+
+
+
+ );
+
+ const Features = () => (
+
+
+ {[
+ {
+ content:
+ 'Built using PyTorch. Makes it easy to use all the PyTorch-ecosystem components.',
+ image: `${baseUrl}img/pytorch.svg`,
+ imageAlign: 'top',
+ title: 'Based on PyTorch',
+ },
+ {
+ content:
+ 'Variety of state of the art pretrained video models and their associated benchmarks that are ready to use.',
+ image: `${baseUrl}img/modelzoo.svg`,
+ imageAlign: 'top',
+ title: 'Reproducible Model Zoo',
+ },
+ // {
+ // content:
+ // 'Variety of benchmark tasks available to evaluate the models.',
+ // image: `${baseUrl}img/reproducible.svg`,
+ // imageAlign: 'top',
+ // title: 'Reproducible Benchmarks',
+ // },
+ {
+ content:
+ 'Video-focused fast and efficient components that are easy to use. Supports accelerated inference on hardware.',
+ image: `${baseUrl}img/efficient.svg`,
+ imageAlign: 'top',
+ title: 'Efficient Video Components',
+ },
+ ]}
+
+
+ );
+
+ const Showcase = () => {
+ if ((siteConfig.users || []).length === 0) {
+ return null;
+ }
+
+ const showcase = siteConfig.users
+ .filter(user => user.pinned)
+ .map(user => (
+
+
+
+ ));
+
+ const pageUrl = page => baseUrl + (language ? `${language}/` : '') + page;
+
+ return (
+
+
Who is Using This?
+
This project is used by all these people
+
{showcase}
+
+
+ );
+ };
+
+ return (
+
+ );
+ }
+}
+
+module.exports = Index;
\ No newline at end of file
diff --git a/website/website/sidebars.json b/website/website/sidebars.json
new file mode 100644
index 00000000..75ed97de
--- /dev/null
+++ b/website/website/sidebars.json
@@ -0,0 +1,7 @@
+{
+ "docs-other": {
+ "Tutorials": ["tutorial_overview"],
+ "Classification": ["tutorial_classification", "tutorial_torchhub_inference"],
+ "Accelerator": ["tutorial_accelerator_build_your_model", "tutorial_accelerator_use_accelerator_model_zoo", "tutorial_accelerator_use_model_transmuter"]
+ }
+}
diff --git a/website/website/siteConfig.js b/website/website/siteConfig.js
new file mode 100644
index 00000000..dcbdd07c
--- /dev/null
+++ b/website/website/siteConfig.js
@@ -0,0 +1,63 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// See https://docusaurus.io/docs/site-config for all the possible
+// site configuration options.
+
+
+const siteConfig = {
+ title: 'PyTorchVideo', // Title for your website.
+ tagline: 'A deep learning library for video understanding research',
+ url: 'https://pytorchvideo.org', // Your website URL
+ baseUrl: '/',
+
+ // Used for publishing and more
+ projectName: 'pytorchvideo',
+ organizationName: 'facebookresearch',
+
+ // For no header links in the top nav bar -> headerLinks: [],
+ headerLinks: [
+ {doc: 'tutorial_overview', label: 'Tutorials'},
+ {href: "https://ptv-temp.readthedocs.io/en/latest/index.html", label: 'Docs'}, // TODO: Change this after the repo becomes public.
+ {href: "https://github.com/facebookresearch/pytorchvideo/", label: 'GitHub'}, //TODO: Change this after repo becomes public
+ ],
+
+
+ /* path to images for header/footer */
+ headerIcon: 'img/logo.svg',
+ footerIcon: 'img/logo.svg',
+ favicon: 'img/favicon.png',
+
+ /* Colors for website */
+ colors: {
+ primaryColor: '#812ce5',
+ secondaryColor: '#cc33cc',
+ },
+
+ // This copyright info is used in /core/Footer.js and blog RSS/Atom feeds.
+ copyright: `Copyright © ${new Date().getFullYear()} Facebook, Inc`,
+
+ highlight: {
+ // Highlight.js theme to use for syntax highlighting in code blocks.
+ theme: 'atom-one-dark',
+ },
+
+ // Add custom scripts here that would be placed in