Commit: hub
Summary: More MViT models

Reviewed By: haooooooqi, lyttonhao

Differential Revision: D30633106

fbshipit-source-id: 6e65c64afed063a0e91541b77dd234543f78484c
bxiong1202 authored and facebook-github-bot committed Aug 30, 2021
1 parent adf576d commit d9ea0fa
Showing 4 changed files with 109 additions and 6 deletions.
2 changes: 2 additions & 0 deletions hubconf.py
@@ -7,7 +7,9 @@
     efficient_x3d_s,
     efficient_x3d_xs,
     i3d_r50,
+    mvit_base_16,
     mvit_base_16x4,
+    mvit_base_32x3,
     r2plus1d_r50,
     slow_r50,
     slow_r50_detection,
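These entrypoints make the new models loadable through torch.hub. A minimal sketch of remote loading (my own example, not part of the commit; requires network access to GitHub and the model zoo):

import torch

# Load the new 32x3 model with its Kinetics-400 checkpoint via torch.hub.
model = torch.hub.load("facebookresearch/pytorchvideo", "mvit_base_32x3", pretrained=True)
model.eval()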
2 changes: 1 addition & 1 deletion pytorchvideo/models/hub/__init__.py
@@ -10,5 +10,5 @@
     slowfast_r50_detection,
     slowfast_r101,
 )
-from .vision_transformers import mvit_base_16x4
+from .vision_transformers import mvit_base_16, mvit_base_16x4, mvit_base_32x3
 from .x3d import x3d_l, x3d_m, x3d_s, x3d_xs
101 changes: 99 additions & 2 deletions pytorchvideo/models/hub/vision_transformers.py
@@ -12,9 +12,12 @@

 checkpoint_paths = {
     "mvit_base_16x4": "{}/kinetics/MVIT_B_16x4.pyth".format(MODEL_ZOO_ROOT_DIR),
+    "mvit_base_32x3": "{}/kinetics/MVIT_B_32x3_f294077834.pyth".format(
+        MODEL_ZOO_ROOT_DIR
+    ),
+    "mvit_base_16": "{}/imagenet/MVIT_B_16_f292487636.pyth".format(MODEL_ZOO_ROOT_DIR),
 }


 mvit_video_base_config = {
     "spatial_size": 224,
     "temporal_size": 16,
@@ -25,6 +28,31 @@
"pool_kvq_kernel": [3, 3, 3],
}

mvit_video_base_32x3_config = {
"spatial_size": 224,
"temporal_size": 32,
"embed_dim_mul": [[1, 2.0], [3, 2.0], [14, 2.0]],
"atten_head_mul": [[1, 2.0], [3, 2.0], [14, 2.0]],
"pool_q_stride_size": [[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]],
"pool_kv_stride_adaptive": [1, 8, 8],
"pool_kvq_kernel": [3, 3, 3],
}

mvit_image_base_16_config = {
"spatial_size": 224,
"temporal_size": 1,
"depth": 16,
"conv_patch_embed_kernel": [7, 7],
"conv_patch_embed_stride": [4, 4],
"conv_patch_embed_padding": [3, 3],
"use_2d_patch": True,
"embed_dim_mul": [[1, 2.0], [3, 2.0], [14, 2.0]],
"atten_head_mul": [[1, 2.0], [3, 2.0], [14, 2.0]],
"pool_q_stride_size": [[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]],
"pool_kv_stride_adaptive": [1, 4, 4],
"pool_kvq_kernel": [1, 3, 3],
}
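A side note on the pooling schedule (my own illustration, not part of the diff): with a 224-pixel input and a spatial patch-embedding stride of 4, the encoder starts from a 56x56 token grid, and each [i, 1, 2, 2] entry in pool_q_stride_size halves both spatial sides at block i, giving 56 -> 28 -> 14 -> 7. The hypothetical helper below just walks that arithmetic:

def token_grid_sides(spatial_size, patch_stride, pool_q_stride_size, depth):
    # Spatial side of the token grid after patch embedding, e.g. 224 // 4 = 56.
    side = spatial_size // patch_stride
    # Map block index -> spatial stride applied to queries at that block.
    stride_at_block = {entry[0]: entry[3] for entry in pool_q_stride_size}
    sides = []
    for block in range(depth):
        side //= stride_at_block.get(block, 1)
        sides.append(side)
    return sides

# Blocks 1, 3, and 14 halve the grid: 56 -> 28 -> 14 -> 7.
print(token_grid_sides(224, 4, [[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]], 16))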


 def mvit_base_16x4(
     pretrained: bool = False,
@@ -34,7 +62,7 @@ def mvit_base_16x4(
"""
Multiscale Vision Transformers model architecture [1] trained with an 16x4
setting on the Kinetics400 dataset. Model with pretrained weights has top1
accuracy of 79.0.
accuracy of 79.0%.
[1] Haoqi Fan, Bo Xiong, Karttikeya Mangalam, Yanghao Li, Zhicheng Yan, Jitendra
Malik, Christoph Feichtenhofer, "Multiscale Vision Transformers"
@@ -59,3 +87,72 @@ def mvit_base_16x4(
         default_config=mvit_video_base_config,
         **kwargs,
     )
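As a usage sketch for this builder (my own example, not part of the diff; shapes follow mvit_video_base_config):

import torch
from pytorchvideo.models.hub import mvit_base_16x4

model = mvit_base_16x4(pretrained=False)
model.eval()

# A single 16-frame, 224x224 clip in (batch, channel, time, height, width) layout.
clip = torch.randn(1, 3, 16, 224, 224)
with torch.no_grad():
    logits = model(clip)  # Kinetics-400 class scores, shape (1, 400).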


+def mvit_base_32x3(
+    pretrained: bool = False,
+    progress: bool = True,
+    **kwargs: Any,
+) -> nn.Module:
+    """
+    Multiscale Vision Transformers model architecture [1] trained with a 32x3
+    setting on the Kinetics400 dataset. Model with pretrained weights has top1
+    accuracy of 80.3%.
+    [1] Haoqi Fan, Bo Xiong, Karttikeya Mangalam, Yanghao Li, Zhicheng Yan, Jitendra
+    Malik, Christoph Feichtenhofer, "Multiscale Vision Transformers"
+    https://arxiv.org/pdf/2104.11227.pdf
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on the Kinetics400 dataset.
+        progress (bool): If True, displays a progress bar of the download to stderr.
+        kwargs: Use these to modify any of the other model settings. All the
+            options are defined in create_multiscale_vision_transformers.
+        NOTE: to use the pretrained model, do not modify the model configuration
+        via the kwargs. Only modify settings via kwargs to initialize a new model
+        without pretrained weights.
+    """
+
+    return hub_model_builder(
+        model_builder_func=create_multiscale_vision_transformers,
+        pretrained=pretrained,
+        progress=progress,
+        checkpoint_path=checkpoint_paths["mvit_base_32x3"],
+        default_config=mvit_video_base_32x3_config,
+        **kwargs,
+    )


+def mvit_base_16(
+    pretrained: bool = False,
+    progress: bool = True,
+    **kwargs: Any,
+) -> nn.Module:
+    """
+    Multiscale Vision Transformers model architecture [1] with a depth of 16 trained
+    on the ImageNet-1k dataset. Model with pretrained weights has top1 accuracy of 83%.
+    [1] Haoqi Fan, Bo Xiong, Karttikeya Mangalam, Yanghao Li, Zhicheng Yan, Jitendra
+    Malik, Christoph Feichtenhofer, "Multiscale Vision Transformers"
+    https://arxiv.org/pdf/2104.11227.pdf
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on the ImageNet-1k dataset.
+        progress (bool): If True, displays a progress bar of the download to stderr.
+        kwargs: Use these to modify any of the other model settings. All the
+            options are defined in create_multiscale_vision_transformers.
+        NOTE: to use the pretrained model, do not modify the model configuration
+        via the kwargs. Only modify settings via kwargs to initialize a new model
+        without pretrained weights.
+    """
+
+    return hub_model_builder(
+        model_builder_func=create_multiscale_vision_transformers,
+        pretrained=pretrained,
+        progress=progress,
+        checkpoint_path=checkpoint_paths["mvit_base_16"],
+        default_config=mvit_image_base_16_config,
+        **kwargs,
+    )
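To illustrate the NOTE in these docstrings (a hypothetical example; head_num_classes is one of the create_multiscale_vision_transformers options): configuration overrides via kwargs are meant only for freshly initialized models, because they change the architecture away from what the released checkpoints were trained with.

from pytorchvideo.models.hub import mvit_base_32x3

# Fine: a new, randomly initialized model with a custom classification head.
model = mvit_base_32x3(pretrained=False, head_num_classes=101)

# Not supported: combining pretrained=True with config-changing kwargs, since
# the resulting architecture would no longer match the released checkpoint.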
10 changes: 7 additions & 3 deletions tests/test_models_hub_vision_transformers.py
@@ -23,12 +23,16 @@ def test_load_mvit_(model_name, pretrained):
                 repo_or_dir=path,
                 source="local",
                 model=model_name,
-                pretrained=False,
+                pretrained=pretrained,
             )
             self.assertIsNotNone(model)
 
-        models = ["mvit_base_16x4", "mvit_base_16x4"]
-        pretrains = [True, False]
+        models = [
+            "mvit_base_16x4",
+            "mvit_base_16",
+            "mvit_base_32x3",
+        ]
+        pretrains = [False, False, False]
 
         for model_name, pretrain in zip(models, pretrains):
             test_load_mvit_(model_name, pretrain)
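The test loads the same entrypoints from a local checkout rather than over the network. The same pattern outside the test suite looks like this (a sketch; the checkout path is a placeholder):

import torch

local_repo = "/path/to/pytorchvideo"  # local clone containing hubconf.py
model = torch.hub.load(
    repo_or_dir=local_repo,
    source="local",
    model="mvit_base_16",
    pretrained=False,
)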
