Skip to content

Commit

Permalink
Update README and refine of MM-GDINO (#11298)
Browse files Browse the repository at this point in the history
  • Loading branch information
hhaAndroid authored Dec 26, 2023
1 parent 63a4bb8 commit e5f9f35
Show file tree
Hide file tree
Showing 51 changed files with 2,956 additions and 341 deletions.
9 changes: 5 additions & 4 deletions configs/glip/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,8 @@ Learning visual representations from natural language supervision has recently s

### Results on Flickr30k

| Model | Official | Pre-Train Data | Val R@1 | Val R@5 | Val R@10 | Test R@1 | Test R@5 | Test R@10 |
| ------------- | -------- | -------------- | ------- | ------- | -------- | -------- | -------- | --------- |
| **GLIP-T(C)** | ✔ | O365, GoldG | 84.8 | 94.9 | 96.3 | 85.5 | 95.4 | 96.6 |
| **GLIP-T(C)** | | O365, GoldG | 84.9 | 94.9 | 96.3 | 85.6 | 95.4 | 96.7 |
| Model | Official | Pre-Train Data | Val R@1 | Val R@5 | Val R@10 | Test R@1 | Test R@5 | Test R@10 |
| ------------- | -------- | ------------------- | ------- | ------- | -------- | -------- | -------- | --------- |
| **GLIP-T(C)** | ✔ | O365, GoldG | 84.8 | 94.9 | 96.3 | 85.5 | 95.4 | 96.6 |
| **GLIP-T(C)** | | O365, GoldG | 84.9 | 94.9 | 96.3 | 85.6 | 95.4 | 96.7 |
| **GLIP-T** | | O365,GoldG,CC3M,SBU | 85.3 | 95.5 | 96.9 | 86.0 | 95.9 | 97.2 |
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

lang_model_name = 'bert-base-uncased'

model = dict(bbox_head=dict(early_fuse=True), )
model = dict(bbox_head=dict(early_fuse=True))

dataset_type = 'Flickr30kDataset'
data_root = 'data/flickr30k/'
data_root = 'data/flickr30k_entities/'

test_pipeline = [
dict(
Expand All @@ -27,15 +27,15 @@
dataset_Flickr30k_val = dict(
type=dataset_type,
data_root=data_root,
ann_file='mdetr_annotations/final_flickr_separateGT_val.json',
ann_file='final_flickr_separateGT_val.json',
data_prefix=dict(img='flickr30k_images/'),
pipeline=test_pipeline,
)

dataset_Flickr30k_test = dict(
type=dataset_type,
data_root=data_root,
ann_file='mdetr_annotations/final_flickr_separateGT_test.json',
ann_file='final_flickr_separateGT_test.json',
data_prefix=dict(img='flickr30k_images/'),
pipeline=test_pipeline,
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'

dataset_type = 'Flickr30kDataset'
data_root = 'data/flickr30k/'
data_root = 'data/flickr30k_entities/'

test_pipeline = [
dict(
Expand All @@ -23,15 +23,15 @@
dataset_Flickr30k_val = dict(
type=dataset_type,
data_root=data_root,
ann_file='mdetr_annotations/final_flickr_separateGT_val.json',
ann_file='final_flickr_separateGT_val.json',
data_prefix=dict(img='flickr30k_images/'),
pipeline=test_pipeline,
)

dataset_Flickr30k_test = dict(
type=dataset_type,
data_root=data_root,
ann_file='mdetr_annotations/final_flickr_separateGT_test.json',
ann_file='final_flickr_separateGT_test.json',
data_prefix=dict(img='flickr30k_images/'),
pipeline=test_pipeline,
)
Expand Down
432 changes: 319 additions & 113 deletions configs/mm_grounding_dino/README.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# https://universe.roboflow.com/roboflow-100/brain-tumor-m2pbp/dataset/2
data_root = 'data/brain_tumor_v2/'
class_name = ('label0', 'label1', 'label2')
label_name = '_annotations.coco.json'

palette = [(220, 20, 60), (255, 0, 0), (0, 0, 142)]

metainfo = dict(classes=class_name, palette=palette)
Expand Down Expand Up @@ -64,20 +66,20 @@
pipeline=train_pipeline,
return_classes=True,
data_prefix=dict(img='train/'),
ann_file='train/_annotations.coco.json')))
ann_file='train/' + label_name)))

val_dataloader = dict(
dataset=dict(
metainfo=metainfo,
data_root=data_root,
return_classes=True,
ann_file='valid/_annotations.coco.json',
ann_file='valid/' + label_name,
data_prefix=dict(img='valid/')))
test_dataloader = val_dataloader

val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'valid/_annotations.coco.json',
ann_file=data_root + 'valid/' + label_name,
metric='bbox',
format_only=False)
test_evaluator = val_evaluator
Expand Down Expand Up @@ -107,4 +109,4 @@

default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))

load_from = ''
load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth' # noqa
Original file line number Diff line number Diff line change
Expand Up @@ -107,4 +107,4 @@
train_cfg = dict(max_epochs=max_epochs, val_interval=1)
default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))

load_from = ''
load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth' # noqa
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
custom_keys={
'absolute_pos_embed': dict(decay_mult=0.),
'backbone': dict(lr_mult=0.1),
# 'language_model': dict(lr_mult=0),
'language_model': dict(lr_mult=0.1),
}))

# learning policy
Expand All @@ -75,11 +75,11 @@
begin=0,
end=max_epochs,
by_epoch=True,
milestones=[11],
milestones=[8, 11],
gamma=0.1)
]
train_cfg = dict(max_epochs=max_epochs, val_interval=1)

default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))

load_from = ''
load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth' # noqa
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,20 @@
'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
'pizza', 'donut', 'chair', 'bed', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'microwave', 'oven', 'toaster',
'refrigerator', 'book', 'clock', 'vase', 'toothbrush')
'refrigerator', 'book', 'clock', 'vase', 'toothbrush') # 48
novel_classes = ('airplane', 'bus', 'cat', 'dog', 'cow', 'elephant',
'umbrella', 'tie', 'snowboard', 'skateboard', 'cup', 'knife',
'cake', 'couch', 'keyboard', 'sink', 'scissors')
all_classes = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'kite', 'skateboard', 'surfboard',
'bottle', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana',
'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'pizza',
'donut', 'cake', 'chair', 'couch', 'bed', 'toilet', 'tv',
'laptop', 'mouse', 'remote', 'keyboard', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
'scissors', 'toothbrush')
'cake', 'couch', 'keyboard', 'sink', 'scissors') # 17
all_classes = (
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
'truck', 'boat', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'kite', 'skateboard',
'surfboard', 'bottle', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana',
'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'pizza', 'donut',
'cake', 'chair', 'couch', 'bed', 'toilet', 'tv', 'laptop', 'mouse',
'remote', 'keyboard', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors', 'toothbrush') # 65

train_metainfo = dict(classes=base_classes)
test_metainfo = dict(
Expand Down Expand Up @@ -95,7 +94,7 @@
type='CocoDataset',
metainfo=train_metainfo,
data_root=data_root,
ann_file='zero-shot/instances_train2017_seen_2.json',
ann_file='annotations/instances_train2017_seen_2.json',
data_prefix=dict(img='train2017/'),
return_classes=True,
filter_cfg=dict(filter_empty_gt=False, min_size=32),
Expand All @@ -111,7 +110,7 @@
type='CocoDataset',
metainfo=test_metainfo,
data_root=data_root,
ann_file='zero-shot/instances_val2017_all_2.json',
ann_file='annotations/instances_val2017_all_2.json',
data_prefix=dict(img='val2017/'),
test_mode=True,
pipeline=test_pipeline,
Expand All @@ -121,7 +120,7 @@

val_evaluator = dict(
type='OVCocoMetric',
ann_file=data_root + 'zero-shot/instances_val2017_all_2.json',
ann_file=data_root + 'annotations/instances_val2017_all_2.json',
metric='bbox',
format_only=False)
test_evaluator = val_evaluator
Expand Down Expand Up @@ -155,4 +154,4 @@
checkpoint=dict(
max_keep_ckpts=1, save_best='coco/novel_ap50', rule='greater'))

load_from = 'epoch_30.pth'
load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth' # noqa
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Path of the config this file is layered on (mmengine `_base_` convention).
# `_base_.lang_model_name` is read from it in the training pipeline below.
_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'

# Dataset root directory; passed as `data_root` to the training dataset below.
data_root = 'data/coco/'

# Training pipeline: load image and boxes, random flip, one of two resize
# branches, text sampling, then packing of the listed meta keys.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', prob=0.5),
    dict(
        type='RandomChoice',
        transforms=[
            # Branch 1: a single multi-scale resize.
            [
                dict(
                    type='RandomChoiceResize',
                    scales=[
                        (480, 1333), (512, 1333), (544, 1333), (576, 1333),
                        (608, 1333), (640, 1333), (672, 1333), (704, 1333),
                        (736, 1333), (768, 1333), (800, 1333),
                    ],
                    keep_ratio=True),
            ],
            # Branch 2: coarse resize, random crop, then the same
            # multi-scale resize as branch 1.
            [
                dict(
                    type='RandomChoiceResize',
                    # The aspect ratio of every image in the train dataset
                    # is < 7; this follows the original implementation.
                    scales=[(400, 4200), (500, 4200), (600, 4200)],
                    keep_ratio=True),
                dict(
                    type='RandomCrop',
                    crop_type='absolute_range',
                    crop_size=(384, 600),
                    allow_negative_crop=True),
                dict(
                    type='RandomChoiceResize',
                    scales=[
                        (480, 1333), (512, 1333), (544, 1333), (576, 1333),
                        (608, 1333), (640, 1333), (672, 1333), (704, 1333),
                        (736, 1333), (768, 1333), (800, 1333),
                    ],
                    keep_ratio=True),
            ],
        ]),
    dict(
        type='RandomSamplingNegPos',
        # Tokenizer name is taken from the base config.
        tokenizer_name=_base_.lang_model_name,
        num_sample_negative=20,  # ======= important =====
        # NOTE(review): this path is spelled out in full, while the training
        # dataset passes what appears to be the same label map relative to
        # `data_root` -- keep the two in sync if the data location changes.
        label_map_file='data/coco/annotations/coco2017_label_map.json',
        max_tokens=256),
    dict(
        type='PackDetInputs',
        meta_keys=(
            'img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor',
            'flip', 'flip_direction', 'text', 'custom_entities',
            'tokens_positive', 'dataset_mode',
        )),
]

# Training dataloader override. `_delete_=True` is the mmengine marker for
# replacing the inherited `dataset` dict instead of merging into it.
train_dataloader = dict(
    dataset=dict(
        _delete_=True,
        type='ODVGDataset',
        need_text=False,
        data_root=data_root,
        # The paths below are written relative to `data_root`
        # (presumably joined by the dataset class -- confirm).
        ann_file='annotations/instances_train2017_od.json',
        label_map_file='annotations/coco2017_label_map.json',
        data_prefix=dict(img='train2017/'),
        return_classes=True,
        filter_cfg=dict(filter_empty_gt=False, min_size=32),
        pipeline=train_pipeline,
    ))

# Optimizer wrapper override (`_delete_=True`: replace the inherited dict
# rather than merge into it). AdamW with gradient-norm clipping and
# per-parameter-group multipliers keyed by parameter-name substrings.
optim_wrapper = dict(
    _delete_=True,
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=5e-5, weight_decay=1e-4),
    clip_grad=dict(max_norm=0.1, norm_type=2),
    paramwise_cfg=dict(
        custom_keys=dict(
            absolute_pos_embed=dict(decay_mult=0.0),
            backbone=dict(lr_mult=0.1),
            # Learning-rate multiplier of 0.0 for the language model.
            language_model=dict(lr_mult=0.0),
        )))

# Learning policy: a 12-epoch run with one epoch-based MultiStepLR schedule
# spanning the whole run (milestones at epochs 8 and 11, gamma 0.1).
max_epochs = 12
param_scheduler = [
    dict(
        type='MultiStepLR',
        begin=0,
        end=max_epochs,
        by_epoch=True,
        milestones=[8, 11],
        gamma=0.1,
    ),
]
# Training loop settings: same epoch budget, val_interval of 1.
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=1,
)

# Checkpoint hook settings: max_keep_ckpts=1 and save_best='auto'.
default_hooks = dict(
    checkpoint=dict(
        max_keep_ckpts=1,
        save_best='auto',
    ))

# Checkpoint URL assigned to `load_from`.
load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
Loading

0 comments on commit e5f9f35

Please sign in to comment.