Commit: hub
Summary: More MViT models

Reviewed By: haooooooqi, lyttonhao

Differential Revision: D30633106

fbshipit-source-id: 6e65c64afed063a0e91541b77dd234543f78484c
bxiong1202 authored and facebook-github-bot committed Aug 30, 2021
1 parent adf576d commit d9ea0fa
Showing 4 changed files with 109 additions and 6 deletions.
2 changes: 2 additions & 0 deletions hubconf.py
@@ -7,7 +7,9 @@
     efficient_x3d_s,
     efficient_x3d_xs,
     i3d_r50,
+    mvit_base_16,
     mvit_base_16x4,
+    mvit_base_32x3,
     r2plus1d_r50,
     slow_r50,
     slow_r50_detection,
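These entrypoints make the new models loadable through torch.hub. A minimal sketch of remote loading (my own example, not part of the commit; requires network access to GitHub and the model zoo):

import torch

# Load the new 32x3 model with its Kinetics-400 checkpoint via torch.hub.
model = torch.hub.load("facebookresearch/pytorchvideo", "mvit_base_32x3", pretrained=True)
model.eval()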
2 changes: 1 addition & 1 deletion pytorchvideo/models/hub/__init__.py
@@ -10,5 +10,5 @@
     slowfast_r50_detection,
     slowfast_r101,
 )
-from .vision_transformers import mvit_base_16x4
+from .vision_transformers import mvit_base_16, mvit_base_16x4, mvit_base_32x3
 from .x3d import x3d_l, x3d_m, x3d_s, x3d_xs
101 changes: 99 additions & 2 deletions pytorchvideo/models/hub/vision_transformers.py
@@ -12,9 +12,12 @@

 checkpoint_paths = {
     "mvit_base_16x4": "{}/kinetics/MVIT_B_16x4.pyth".format(MODEL_ZOO_ROOT_DIR),
+    "mvit_base_32x3": "{}/kinetics/MVIT_B_32x3_f294077834.pyth".format(
+        MODEL_ZOO_ROOT_DIR
+    ),
+    "mvit_base_16": "{}/imagenet/MVIT_B_16_f292487636.pyth".format(MODEL_ZOO_ROOT_DIR),
 }


 mvit_video_base_config = {
     "spatial_size": 224,
     "temporal_size": 16,
@@ -25,6 +28,31 @@
"pool_kvq_kernel": [3, 3, 3],
}

mvit_video_base_32x3_config = {
"spatial_size": 224,
"temporal_size": 32,
"embed_dim_mul": [[1, 2.0], [3, 2.0], [14, 2.0]],
"atten_head_mul": [[1, 2.0], [3, 2.0], [14, 2.0]],
"pool_q_stride_size": [[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]],
"pool_kv_stride_adaptive": [1, 8, 8],
"pool_kvq_kernel": [3, 3, 3],
}

mvit_image_base_16_config = {
"spatial_size": 224,
"temporal_size": 1,
"depth": 16,
"conv_patch_embed_kernel": [7, 7],
"conv_patch_embed_stride": [4, 4],
"conv_patch_embed_padding": [3, 3],
"use_2d_patch": True,
"embed_dim_mul": [[1, 2.0], [3, 2.0], [14, 2.0]],
"atten_head_mul": [[1, 2.0], [3, 2.0], [14, 2.0]],
"pool_q_stride_size": [[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]],
"pool_kv_stride_adaptive": [1, 4, 4],
"pool_kvq_kernel": [1, 3, 3],
}
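A side note on the pooling schedule (my own illustration, not part of the diff): with a 224-pixel input and a spatial patch-embedding stride of 4, the encoder starts from a 56x56 token grid, and each [i, 1, 2, 2] entry in pool_q_stride_size halves both spatial sides at block i, giving 56 -> 28 -> 14 -> 7. The hypothetical helper below just walks that arithmetic:

def token_grid_sides(spatial_size, patch_stride, pool_q_stride_size, depth):
    # Spatial side of the token grid after patch embedding, e.g. 224 // 4 = 56.
    side = spatial_size // patch_stride
    # Map block index -> spatial stride applied to queries at that block.
    stride_at_block = {entry[0]: entry[3] for entry in pool_q_stride_size}
    sides = []
    for block in range(depth):
        side //= stride_at_block.get(block, 1)
        sides.append(side)
    return sides

# Blocks 1, 3, and 14 halve the grid: 56 -> 28 -> 14 -> 7.
print(token_grid_sides(224, 4, [[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]], 16))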


 def mvit_base_16x4(
     pretrained: bool = False,
@@ -34,7 +62,7 @@ def mvit_base_16x4(
"""
Multiscale Vision Transformers model architecture [1] trained with an 16x4
setting on the Kinetics400 dataset. Model with pretrained weights has top1
accuracy of 79.0.
accuracy of 79.0%.
[1] Haoqi Fan, Bo Xiong, Karttikeya Mangalam, Yanghao Li, Zhicheng Yan, Jitendra
Malik, Christoph Feichtenhofer, "Multiscale Vision Transformers"
@@ -59,3 +87,72 @@ def mvit_base_16x4(
         default_config=mvit_video_base_config,
         **kwargs,
     )
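As a usage sketch for this builder (my own example, not part of the diff; shapes follow mvit_video_base_config):

import torch
from pytorchvideo.models.hub import mvit_base_16x4

model = mvit_base_16x4(pretrained=False)
model.eval()

# A single 16-frame, 224x224 clip in (batch, channel, time, height, width) layout.
clip = torch.randn(1, 3, 16, 224, 224)
with torch.no_grad():
    logits = model(clip)  # Kinetics-400 class scores, shape (1, 400).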


+def mvit_base_32x3(
+    pretrained: bool = False,
+    progress: bool = True,
+    **kwargs: Any,
+) -> nn.Module:
+    """
+    Multiscale Vision Transformers model architecture [1] trained with a 32x3
+    setting on the Kinetics400 dataset. Model with pretrained weights has top1
+    accuracy of 80.3%.
+    [1] Haoqi Fan, Bo Xiong, Karttikeya Mangalam, Yanghao Li, Zhicheng Yan, Jitendra
+    Malik, Christoph Feichtenhofer, "Multiscale Vision Transformers"
+    https://arxiv.org/pdf/2104.11227.pdf
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on the Kinetics400 dataset.
+        progress (bool): If True, displays a progress bar of the download to stderr.
+        kwargs: Use these to modify any of the other model settings. All the
+            options are defined in create_multiscale_vision_transformers.
+        NOTE: to use the pretrained model, do not modify the model configuration
+        via the kwargs. Only modify settings via kwargs to initialize a new model
+        without pretrained weights.
+    """
+
+    return hub_model_builder(
+        model_builder_func=create_multiscale_vision_transformers,
+        pretrained=pretrained,
+        progress=progress,
+        checkpoint_path=checkpoint_paths["mvit_base_32x3"],
+        default_config=mvit_video_base_32x3_config,
+        **kwargs,
+    )


+def mvit_base_16(
+    pretrained: bool = False,
+    progress: bool = True,
+    **kwargs: Any,
+) -> nn.Module:
+    """
+    Multiscale Vision Transformers model architecture [1] with a depth of 16 trained
+    on the ImageNet-1k dataset. Model with pretrained weights has top1 accuracy of 83%.
+    [1] Haoqi Fan, Bo Xiong, Karttikeya Mangalam, Yanghao Li, Zhicheng Yan, Jitendra
+    Malik, Christoph Feichtenhofer, "Multiscale Vision Transformers"
+    https://arxiv.org/pdf/2104.11227.pdf
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on the ImageNet-1k dataset.
+        progress (bool): If True, displays a progress bar of the download to stderr.
+        kwargs: Use these to modify any of the other model settings. All the
+            options are defined in create_multiscale_vision_transformers.
+        NOTE: to use the pretrained model, do not modify the model configuration
+        via the kwargs. Only modify settings via kwargs to initialize a new model
+        without pretrained weights.
+    """
+
+    return hub_model_builder(
+        model_builder_func=create_multiscale_vision_transformers,
+        pretrained=pretrained,
+        progress=progress,
+        checkpoint_path=checkpoint_paths["mvit_base_16"],
+        default_config=mvit_image_base_16_config,
+        **kwargs,
+    )
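To illustrate the NOTE in these docstrings (a hypothetical example; head_num_classes is one of the create_multiscale_vision_transformers options): configuration overrides via kwargs are meant only for freshly initialized models, because they change the architecture away from what the released checkpoints were trained with.

from pytorchvideo.models.hub import mvit_base_32x3

# Fine: a new, randomly initialized model with a custom classification head.
model = mvit_base_32x3(pretrained=False, head_num_classes=101)

# Not supported: combining pretrained=True with config-changing kwargs, since
# the resulting architecture would no longer match the released checkpoint.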
10 changes: 7 additions & 3 deletions tests/test_models_hub_vision_transformers.py
@@ -23,12 +23,16 @@ def test_load_mvit_(model_name, pretrained):
                 repo_or_dir=path,
                 source="local",
                 model=model_name,
-                pretrained=False,
+                pretrained=pretrained,
             )
             self.assertIsNotNone(model)
 
-        models = ["mvit_base_16x4", "mvit_base_16x4"]
-        pretrains = [True, False]
+        models = [
+            "mvit_base_16x4",
+            "mvit_base_16",
+            "mvit_base_32x3",
+        ]
+        pretrains = [False, False, False]
 
         for model_name, pretrain in zip(models, pretrains):
             test_load_mvit_(model_name, pretrain)
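The test loads the same entrypoints from a local checkout rather than over the network. The same pattern outside the test suite looks like this (a sketch; the checkout path is a placeholder):

import torch

local_repo = "/path/to/pytorchvideo"  # local clone containing hubconf.py
model = torch.hub.load(
    repo_or_dir=local_repo,
    source="local",
    model="mvit_base_16",
    pretrained=False,
)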
