Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions checkpoints/download_ckpts_efficienttam.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# Adapted from https://github.com/facebookresearch/sam2/blob/main/checkpoints/download_ckpts.sh

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Download every Efficient Track Anything checkpoint into the current
# directory.
#
# wget is used when it is installed, otherwise curl. The script stops at the
# first download that fails and exits with status 1; error messages are
# written to stderr.

# Pick the downloader. The command is kept in an array so that the program
# name and its flags remain separate words without relying on unquoted
# word splitting.
if command -v wget &> /dev/null; then
  download_cmd=(wget)
elif command -v curl &> /dev/null; then
  # -f makes curl exit non-zero on HTTP errors (e.g. 404). Without it curl
  # saves the server's error page under the checkpoint file name and exits 0,
  # so the failure check below would never trigger.
  download_cmd=(curl -f -L -O)
else
  echo "Please install wget or curl to download the checkpoints." >&2
  exit 1
fi

# Base URL that all Efficient Track Anything checkpoints are served from
EfficientTAM_BASE_URL="https://huggingface.co/yunyangx/efficient-track-anything/resolve/main"

# Checkpoint file names, listed in download order
checkpoints=(
  efficienttam_s.pt
  efficienttam_s_512x512.pt
  efficienttam_s_1.pt
  efficienttam_s_2.pt
  efficienttam_ti.pt
  efficienttam_ti_512x512.pt
  efficienttam_ti_1.pt
  efficienttam_ti_2.pt
)

# Efficient Track Anything checkpoints
for ckpt in "${checkpoints[@]}"; do
  url="${EfficientTAM_BASE_URL}/${ckpt}"
  echo "Downloading ${ckpt} checkpoint..."
  if ! "${download_cmd[@]}" "$url"; then
    echo "Failed to download checkpoint from $url" >&2
    exit 1
  fi
done

echo "All efficient track anything checkpoints are downloaded successfully."
120 changes: 72 additions & 48 deletions notebooks/video_predictor_example.ipynb

Large diffs are not rendered by default.

120 changes: 120 additions & 0 deletions sam2/configs/efficienttam/efficienttam_s.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# @package _global_

# NOTE(review): in this listing every key sits at column 0 and `_target_`
# repeats many times, so the nesting of the YAML appears to have been lost.
# The comments below describe the order of the keys only; confirm
# parent/child relationships against the checked-in file.
# Each `_target_` value is a dotted Python path; the keys that follow it are
# presumably the constructor arguments of that class (Hydra-style
# instantiation) -- confirm.

# Model
model:
_target_: sam2.modeling.efficienttam_base.EfficientTAMBase
# Image encoder: a ViT trunk followed by a ViTDetNeck.
image_encoder:
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
scalp: 0
trunk:
_target_: sam2.modeling.backbones.vitdet.ViT
patch_size: 16
embed_dim: 384
depth: 12
num_heads: 6
mlp_ratio: 4.0
qkv_bias: true
drop_path_rate: 0.0
use_rel_pos: false
window_size: 14
# Lists 8 of the 12 block indexes (2, 5, 8 and 11 are absent); presumably the
# listed blocks use windowed attention with window_size above -- confirm.
window_block_indexes: [0, 1, 3, 4, 6, 7, 9, 10]
neck:
_target_: sam2.modeling.backbones.image_encoder.ViTDetNeck
position_encoding:
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
num_pos_feats: 256
normalize: true
scale: null
temperature: 10000
d_model: 256
# Single entry, equal to the trunk's embed_dim (384).
backbone_channel_list: [384,]
neck_norm: LN

# Memory attention: one layer definition containing a self-attention and a
# cross-attention module (both RoPEAttention in this config).
memory_attention:
_target_: sam2.modeling.memory_attention.MemoryAttention
d_model: 256
pos_enc_at_input: true
layer:
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
activation: relu
dim_feedforward: 2048
dropout: 0.1
pos_enc_at_attn: false
self_attention:
_target_: sam2.modeling.sam.transformer.RoPEAttention
rope_theta: 10000.0
# 64 = image_size (1024) / patch_size (16); presumably these must be kept in
# sync when either value changes -- confirm.
feat_sizes: [64, 64]
embedding_dim: 256
num_heads: 1
downsample_rate: 1
dropout: 0.1
d_model: 256
pos_enc_at_cross_attn_keys: true
pos_enc_at_cross_attn_queries: false
cross_attention:
_target_: sam2.modeling.sam.transformer.RoPEAttention
rope_theta: 10000.0
feat_sizes: [64, 64]
rope_k_repeat: True
embedding_dim: 256
num_heads: 1
downsample_rate: 1
dropout: 0.1
# Same value as the memory encoder's out_dim (64) further down.
kv_in_dim: 64
# Presumably the number of memory-attention layers (not a cross_attention
# argument) -- confirm against the indented original.
num_layers: 4

# Memory encoder: mask downsampler, fuser built from CXBlock layers, and its
# own sine position encoding.
memory_encoder:
_target_: sam2.modeling.memory_encoder.MemoryEncoder
out_dim: 64
position_encoding:
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
num_pos_feats: 64
normalize: true
scale: null
temperature: 10000
mask_downsampler:
_target_: sam2.modeling.memory_encoder.MaskDownSampler
kernel_size: 3
stride: 2
padding: 1
fuser:
_target_: sam2.modeling.memory_encoder.Fuser
layer:
_target_: sam2.modeling.memory_encoder.CXBlock
dim: 256
kernel_size: 7
padding: 3
layer_scale_init_value: 1e-6
use_dwconv: True # depth-wise convs
num_layers: 2

# Scalar settings; presumably arguments of the top-level model class -- confirm.
num_maskmem: 7
image_size: 1024
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
# SAM decoder
sigmoid_scale_for_mem_enc: 20.0
sigmoid_bias_for_mem_enc: -10.0
use_mask_input_as_output_without_sam: true
# Memory
directly_add_no_mem_embed: true
use_high_res_features_in_sam: false
# output 3 masks on the first click on initial conditioning frames
multimask_output_in_sam: true
# SAM heads
iou_prediction_use_sigmoid: True
# cross-attend to object pointers from other frames in the ViT encoder
use_obj_ptrs_in_encoder: true
add_tpos_enc_to_obj_ptrs: false
only_obj_ptrs_in_the_past_for_eval: true
# object occlusion prediction
pred_obj_scores: true
pred_obj_scores_mlp: true
fixed_no_obj_ptr: true
# multimask tracking settings
multimask_output_for_tracking: true
use_multimask_token_for_obj_ptr: true
multimask_min_pt_num: 0
multimask_max_pt_num: 1
use_mlp_for_obj_ptr_proj: true
# Compilation flag
compile_image_encoder: False
120 changes: 120 additions & 0 deletions sam2/configs/efficienttam/efficienttam_s_1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# @package _global_

# NOTE(review): in this listing every key sits at column 0 and `_target_`
# repeats many times, so the nesting of the YAML appears to have been lost.
# The comments below describe the order of the keys only; confirm
# parent/child relationships against the checked-in file.
# Each `_target_` value is a dotted Python path; the keys that follow it are
# presumably the constructor arguments of that class (Hydra-style
# instantiation) -- confirm.

# Model
model:
_target_: sam2.modeling.efficienttam_base.EfficientTAMBase
# Image encoder: a ViT trunk followed by a ViTDetNeck.
image_encoder:
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
scalp: 0
trunk:
_target_: sam2.modeling.backbones.vitdet.ViT
patch_size: 16
embed_dim: 384
depth: 12
num_heads: 6
mlp_ratio: 4.0
qkv_bias: true
drop_path_rate: 0.0
use_rel_pos: false
window_size: 14
# Lists 8 of the 12 block indexes (2, 5, 8 and 11 are absent); presumably the
# listed blocks use windowed attention with window_size above -- confirm.
window_block_indexes: [0, 1, 3, 4, 6, 7, 9, 10]
neck:
_target_: sam2.modeling.backbones.image_encoder.ViTDetNeck
position_encoding:
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
num_pos_feats: 256
normalize: true
scale: null
temperature: 10000
d_model: 256
# Single entry, equal to the trunk's embed_dim (384).
backbone_channel_list: [384,]
neck_norm: LN

# Memory attention: one layer definition containing a self-attention module
# (RoPEAttention) and a cross-attention module (EfficientRoPEAttention1).
memory_attention:
_target_: sam2.modeling.memory_attention.MemoryAttention
d_model: 256
pos_enc_at_input: true
layer:
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
activation: relu
dim_feedforward: 2048
dropout: 0.1
pos_enc_at_attn: false
self_attention:
_target_: sam2.modeling.sam.transformer.RoPEAttention
rope_theta: 10000.0
# 64 = image_size (1024) / patch_size (16); presumably these must be kept in
# sync when either value changes -- confirm.
feat_sizes: [64, 64]
embedding_dim: 256
num_heads: 1
downsample_rate: 1
dropout: 0.1
d_model: 256
pos_enc_at_cross_attn_keys: true
pos_enc_at_cross_attn_queries: false
cross_attention:
# Cross-attention uses a different class from self_attention above; the keys
# passed to it are the same set a RoPEAttention cross-attention would take
# plus nothing extra in this file.
_target_: sam2.modeling.sam.transformer.EfficientRoPEAttention1
rope_theta: 10000.0
feat_sizes: [64, 64]
rope_k_repeat: True
embedding_dim: 256
num_heads: 1
downsample_rate: 1
dropout: 0.1
# Same value as the memory encoder's out_dim (64) further down.
kv_in_dim: 64
# Presumably the number of memory-attention layers (not a cross_attention
# argument) -- confirm against the indented original.
num_layers: 4

# Memory encoder: mask downsampler, fuser built from CXBlock layers, and its
# own sine position encoding.
memory_encoder:
_target_: sam2.modeling.memory_encoder.MemoryEncoder
out_dim: 64
position_encoding:
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
num_pos_feats: 64
normalize: true
scale: null
temperature: 10000
mask_downsampler:
_target_: sam2.modeling.memory_encoder.MaskDownSampler
kernel_size: 3
stride: 2
padding: 1
fuser:
_target_: sam2.modeling.memory_encoder.Fuser
layer:
_target_: sam2.modeling.memory_encoder.CXBlock
dim: 256
kernel_size: 7
padding: 3
layer_scale_init_value: 1e-6
use_dwconv: True # depth-wise convs
num_layers: 2

# Scalar settings; presumably arguments of the top-level model class -- confirm.
num_maskmem: 7
image_size: 1024
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
# SAM decoder
sigmoid_scale_for_mem_enc: 20.0
sigmoid_bias_for_mem_enc: -10.0
use_mask_input_as_output_without_sam: true
# Memory
directly_add_no_mem_embed: true
use_high_res_features_in_sam: false
# output 3 masks on the first click on initial conditioning frames
multimask_output_in_sam: true
# SAM heads
iou_prediction_use_sigmoid: True
# cross-attend to object pointers from other frames in the ViT encoder
use_obj_ptrs_in_encoder: true
add_tpos_enc_to_obj_ptrs: false
only_obj_ptrs_in_the_past_for_eval: true
# object occlusion prediction
pred_obj_scores: true
pred_obj_scores_mlp: true
fixed_no_obj_ptr: true
# multimask tracking settings
multimask_output_for_tracking: true
use_multimask_token_for_obj_ptr: true
multimask_min_pt_num: 0
multimask_max_pt_num: 1
use_mlp_for_obj_ptr_proj: true
# Compilation flag
compile_image_encoder: False
Loading